diff --git a/CHANGELOG.md b/CHANGELOG.md index de3eac1..f2f776f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,19 @@ package _DOES NOT_ follow the PVP, or indeed any sensible version scheme, because NVIDIA are A-OK introducing breaking changes in minor updates. +## [0.12.8.0] - ??? +### Added + * Support for CUDA-12 + - Thanks to @noahmartinwilliams on GitHub for helping out! + +### Removed + * The following modules have been deprecated for a long time, and have + finally been removed in CUDA-12: + - `Foreign.CUDA.Driver.Texture` + - `Foreign.CUDA.Runtime.Texture` + Support for Texture Objects (their replacement) is missing in these + bindings so far. Contributions welcome. + ## [0.11.0.1] - 2023-08-15 ### Fixed * Build fixes for GHC 9.2 .. 9.6 diff --git a/README.md b/README.md index e14eff8..8af77b6 100644 --- a/README.md +++ b/README.md @@ -145,3 +145,6 @@ An incomplete list of missing bindings. Pull requests welcome! - cuGraphMemAllocNodeGetParams - cuGraphMemFreeNodeGetParams +### CUDA-12 + +A lot. PRs welcome. diff --git a/cbits/stubs.c b/cbits/stubs.c index 3b363ee..67f9280 100644 --- a/cbits/stubs.c +++ b/cbits/stubs.c @@ -22,17 +22,6 @@ cudaError_t cudaConfigureCall_simple(unsigned int gridX, unsigned int gridY, uns } #endif -CUresult cuTexRefSetAddress2D_simple(CUtexref tex, CUarray_format format, unsigned int numChannels, CUdeviceptr dptr, size_t width, size_t height, size_t pitch) -{ - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = format; - desc.NumChannels = numChannels; - desc.Width = width; - desc.Height = height; - - return cuTexRefSetAddress2D(tex, &desc, dptr, pitch); -} - CUresult cuMemcpy2DHtoD(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int dstXInBytes, unsigned int dstY, void* srcHost, unsigned int srcPitch, unsigned int srcXInBytes, unsigned int srcY, unsigned int widthInBytes, unsigned int height) { CUDA_MEMCPY2D desc; @@ -284,11 +273,6 @@ CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) { return cuMemsetD32_v2(dstDevice, ui, N); } - -CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) -{ - return cuTexRefSetAddress_v2(ByteOffset, hTexRef, dptr, bytes); -} #endif #if CUDA_VERSION >= 4000 diff --git a/cuda.cabal b/cuda.cabal index e401bda..d9d8ebd 100644 --- a/cuda.cabal +++ b/cuda.cabal @@ -1,7 +1,7 @@ cabal-version: 1.24 Name: cuda -Version: 0.11.0.1 +Version: 0.12.8.0 Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs Description: The CUDA library provides a direct, general purpose C-like SPMD programming @@ -121,7 +121,6 @@ Library Foreign.CUDA.Driver.Module.Query Foreign.CUDA.Driver.Profiler Foreign.CUDA.Driver.Stream - Foreign.CUDA.Driver.Texture Foreign.CUDA.Driver.Unified Foreign.CUDA.Driver.Utils @@ -133,7 +132,6 @@ Library Foreign.CUDA.Runtime.Exec Foreign.CUDA.Runtime.Marshal Foreign.CUDA.Runtime.Stream - Foreign.CUDA.Runtime.Texture Foreign.CUDA.Runtime.Utils -- Extras @@ -151,6 +149,7 @@ Library build-depends: base >= 4.7 && < 5 , bytestring >= 0.10.4 + , containers , filepath >= 1.0 , template-haskell , uuid-types >= 1.0 diff --git a/src/Foreign/CUDA/Analysis/Device.chs b/src/Foreign/CUDA/Analysis/Device.chs index bfeeb4a..4a44e8b 100644 --- a/src/Foreign/CUDA/Analysis/Device.chs +++ b/src/Foreign/CUDA/Analysis/Device.chs @@ -19,8 +19,12 @@ module Foreign.CUDA.Analysis.Device ( #include "cbits/stubs.h" +import qualified Data.Set as Set +import Data.Set (Set) import Data.Int +import Data.IORef import Text.Show.Describe +import System.IO.Unsafe import Debug.Trace @@ -179,7 +183,17 @@ data DeviceResources = DeviceResources deviceResources :: DeviceProperties -> DeviceResources deviceResources = resources . computeCapability where - -- This is mostly extracted from tables in the CUDA occupancy calculator. + -- Sources: + -- [1] https://github.com/NVIDIA/cuda-samples/blob/7b60178984e96bc09d066077d5455df71fee2a9f/Common/helper_cuda.h + -- - for: coresPerMP (line 643 _ConvertSMVer2Cores) + -- - for: architecture names (line 695 _ConvertSMVer2ArchName) + -- [2] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability + -- - for: maxGridsPerDevice + -- - archived here: https://web.archive.org/web/20250409220108/https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability + -- - reproduced here: https://en.wikipedia.org/w/index.php?title=CUDA&oldid=1285775690#Technical_specification (note: link to specific page version) + -- [3] NVidia Nsight Compute + -- - for: the other fields + -- - left top "Start Activity" -> "Occupancy Calculator" -> "Launch"; tab "GPU Data" -- resources compute = case compute of Compute 1 0 -> resources (Compute 1 1) -- Tesla G80 @@ -283,7 +297,7 @@ deviceResources = resources . computeCapability } Compute 5 2 -> (resources (Compute 5 0)) -- Maxwell GM20x { sharedMemPerMP = 98304 - , maxRegPerBlock = 32768 + , maxRegPerBlock = 32768 -- value from [3], wrong in [2]? , warpAllocUnit = 2 } Compute 5 3 -> (resources (Compute 5 0)) -- Maxwell GM20B @@ -318,9 +332,15 @@ deviceResources = resources . computeCapability } Compute 6 2 -> (resources (Compute 6 0)) -- Pascal GP10B { coresPerMP = 128 - , warpsPerMP = 128 - , threadBlocksPerMP = 4096 - , maxRegPerBlock = 32768 + -- Commit 4f75ea889c2ade2bd3eab377b51bb5bbd28bfbae changed warpsPerMP + -- to 128, but [2] and [3] say 64 like CC 6.0; reverted back to 64 to + -- match NVIDIA documentation. + -- That commit also changed threadsPerMP (later mistakenly translated + -- to threadBlocksPerMP in 9df19adec8efc9df761deab40cf04d27810d97d3) + -- from 2048 to 4096, but again [2] and [3] retain 2048 so we keep it + -- at that. + , warpsPerMP = 64 + , maxRegPerBlock = 32768 -- value from [2], wrong in [3]? , warpAllocUnit = 4 , maxGridsPerDevice = 16 } @@ -346,7 +366,7 @@ deviceResources = resources . computeCapability Compute 7 2 -> (resources (Compute 7 0)) -- Volta GV10B { maxGridsPerDevice = 16 - , maxSharedMemPerBlock = 49152 + , maxSharedMemPerBlock = 49152 -- unsure why this is here; [2] and [3] say still 98304 } Compute 7 5 -> (resources (Compute 7 0)) -- Turing TU1xx @@ -376,14 +396,91 @@ deviceResources = resources . computeCapability , warpRegAllocUnit = 256 , maxGridsPerDevice = 128 } - Compute 8 6 -> (resources (Compute 8 0)) -- Ampere GA102 - { warpsPerMP = 48 + { coresPerMP = 128 + , warpsPerMP = 48 , threadsPerMP = 1536 , threadBlocksPerMP = 16 , sharedMemPerMP = 102400 , maxSharedMemPerBlock = 102400 } + Compute 8 7 -> (resources (Compute 8 0)) -- Ampere + { coresPerMP = 128 + , warpsPerMP = 48 + , threadsPerMP = 1536 + , threadBlocksPerMP = 16 + } + Compute 8 9 -> (resources (Compute 8 0)) -- Ada + { coresPerMP = 128 + , warpsPerMP = 48 + , threadsPerMP = 1536 + , threadBlocksPerMP = 24 + , sharedMemPerMP = 102400 + , maxSharedMemPerBlock = 102400 + } + + Compute 9 0 -> DeviceResources -- Hopper + { threadsPerWarp = 32 + , coresPerMP = 128 + , warpsPerMP = 64 + , threadsPerMP = 2048 + , threadBlocksPerMP = 32 + , sharedMemPerMP = 233472 + , maxSharedMemPerBlock = 233472 + , regFileSizePerMP = 65536 + , maxRegPerBlock = 65536 + , regAllocUnit = 256 + , regAllocationStyle = Warp + , maxRegPerThread = 255 + , sharedMemAllocUnit = 128 + , warpAllocUnit = 4 + , warpRegAllocUnit = 256 + , maxGridsPerDevice = 128 + } + + Compute 10 0 -> DeviceResources -- Blackwell + { threadsPerWarp = 32 + , coresPerMP = 128 + , warpsPerMP = 64 + , threadsPerMP = 2048 + , threadBlocksPerMP = 32 + , sharedMemPerMP = 233472 + , maxSharedMemPerBlock = 233472 + , regFileSizePerMP = 65536 + , maxRegPerBlock = 65536 + , regAllocUnit = 256 + , regAllocationStyle = Warp + , maxRegPerThread = 255 + , sharedMemAllocUnit = 128 + , warpAllocUnit = 4 + , warpRegAllocUnit = 256 + , maxGridsPerDevice = 128 + } + Compute 10 1 -> (resources (Compute 10 0)) -- Blackwell + { warpsPerMP = 48 + , threadsPerMP = 1536 + , threadBlocksPerMP = 24 + } + + Compute 12 0 -> DeviceResources -- Blackwell + { threadsPerWarp = 32 + , coresPerMP = 128 + , warpsPerMP = 48 + , threadsPerMP = 1536 + , threadBlocksPerMP = 24 + , sharedMemPerMP = 102400 + , maxSharedMemPerBlock = 102400 + , regFileSizePerMP = 65536 + , maxRegPerBlock = 65536 + , regAllocUnit = 256 + , regAllocationStyle = Warp + , maxRegPerThread = 255 + , sharedMemAllocUnit = 128 + , warpAllocUnit = 4 + , warpRegAllocUnit = 256 + , maxGridsPerDevice = 128 + } + -- Something might have gone wrong, or the library just needs to be -- updated for the next generation of hardware, in which case we just want @@ -393,7 +490,30 @@ deviceResources = resources . computeCapability -- However, it should be OK because all library functions run in IO, so it -- is likely the user code is as well. -- - _ -> trace warning $ resources (Compute 6 0) - where warning = unlines [ "*** Warning: Unknown CUDA device compute capability: " ++ show compute - , "*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ] - + _ -> case warningForCC compute of + Just warning -> trace warning defaultResources + Nothing -> defaultResources + + defaultResources = resources (Compute 6 0) + + -- All this logic is to ensure the warning is only shown once per unknown + -- compute capability. This sounds not worth it, but in practice, it is: + -- empirically, an unknown compute capability often leads to /screenfuls/ + -- of warnings in accelerate-llvm-ptx otherwise. + {-# NOINLINE warningForCC #-} + warningForCC :: Compute -> Maybe String + warningForCC compute = unsafePerformIO $ do + unseen <- atomicModifyIORef' warningShown $ \seen -> + -- This is just one tree traversal; lookup-insert would be two traversals. + let seen' = Set.insert compute seen + in (seen', Set.size seen' > Set.size seen) + return $ if unseen + then Just $ unlines + [ "*** Warning: Unknown CUDA device compute capability: " ++ show compute + , "*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" + , "*** (This warning will only be shown once for this compute capability)" ] + else Nothing + + {-# NOINLINE warningShown #-} + warningShown :: IORef (Set Compute) + warningShown = unsafePerformIO $ newIORef mempty diff --git a/src/Foreign/CUDA/Analysis/Occupancy.hs b/src/Foreign/CUDA/Analysis/Occupancy.hs index b7a0ea9..6a6cb8f 100644 --- a/src/Foreign/CUDA/Analysis/Occupancy.hs +++ b/src/Foreign/CUDA/Analysis/Occupancy.hs @@ -28,6 +28,14 @@ -- the number in the @.cubin@ file to the amount you dynamically allocate at run -- time to get the correct shared memory usage. -- +-- __Warning__: Like the official Occupancy Calculator in NVidia Nsight +-- Compute, the calculator in this module does not support or consider Thread +-- Block Clusters +-- () +-- that have been introduced with compute capability 9.0 (Hopper). If you use +-- thread block clusters in your kernels, the results you get with the +-- functions in this module may not be accurate. Profile and measure. +-- -- /Notes About Occupancy/ -- -- Higher occupancy does not necessarily mean higher performance. If a kernel diff --git a/src/Foreign/CUDA/Driver/Graph/Capture.chs b/src/Foreign/CUDA/Driver/Graph/Capture.chs index 8876542..160ff70 100644 --- a/src/Foreign/CUDA/Driver/Graph/Capture.chs +++ b/src/Foreign/CUDA/Driver/Graph/Capture.chs @@ -152,13 +152,23 @@ status = requireSDK 'status 10.0 #if CUDA_VERSION < 10010 info :: Stream -> IO (Status, Int64) info = requireSDK 'info 10.1 -#else +#elif CUDA_VERSION < 12000 {# fun unsafe cuStreamGetCaptureInfo as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* } -> `()' checkStatus*- #} +#else +{# fun unsafe cuStreamGetCaptureInfo_v2 as info + { useStream `Stream' + , alloca- `Status' peekEnum* + , alloca- `Int64' peekIntConv* + , alloca- `Graph' + , alloca- `Node' + , alloca- `CSize' + } + -> `()' checkStatus*- #} #endif diff --git a/src/Foreign/CUDA/Driver/Module/Query.chs b/src/Foreign/CUDA/Driver/Module/Query.chs index 0998b3a..e484d8e 100644 --- a/src/Foreign/CUDA/Driver/Module/Query.chs +++ b/src/Foreign/CUDA/Driver/Module/Query.chs @@ -16,7 +16,7 @@ module Foreign.CUDA.Driver.Module.Query ( -- ** Querying module inhabitants - getFun, getPtr, getTex, + getFun, getPtr, ) where @@ -28,7 +28,6 @@ import Foreign.CUDA.Driver.Error import Foreign.CUDA.Driver.Exec import Foreign.CUDA.Driver.Marshal ( peekDeviceHandle ) import Foreign.CUDA.Driver.Module.Base -import Foreign.CUDA.Driver.Texture import Foreign.CUDA.Internal.C2HS import Foreign.CUDA.Ptr @@ -92,26 +91,6 @@ getPtr !mdl !name = do -> `Status' cToEnum #} --- | --- Return a handle to a texture reference. This texture reference handle --- should not be destroyed, as the texture will be destroyed automatically --- when the module is unloaded. --- --- --- -{-# INLINEABLE getTex #-} -getTex :: Module -> ShortByteString -> IO Texture -getTex !mdl !name = resultIfFound "texture" name =<< cuModuleGetTexRef mdl name - -{-# INLINE cuModuleGetTexRef #-} -{# fun unsafe cuModuleGetTexRef - { alloca- `Texture' peekTex* - , useModule `Module' - , useAsCString* `ShortByteString' - } - -> `Status' cToEnum #} - - -------------------------------------------------------------------------------- -- Internal -------------------------------------------------------------------------------- diff --git a/src/Foreign/CUDA/Driver/Stream.chs b/src/Foreign/CUDA/Driver/Stream.chs index b85dc9a..70f2034 100644 --- a/src/Foreign/CUDA/Driver/Stream.chs +++ b/src/Foreign/CUDA/Driver/Stream.chs @@ -334,6 +334,7 @@ write32 _ _ _ _ = requireSDK 'write32 8.0 write32 ptr val stream flags = nothingIfOk =<< cuStreamWriteValue32 stream ptr val flags {-# INLINE cuStreamWriteValue32 #-} +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamWriteValue32 { useStream `Stream' , useDeviceHandle `DevicePtr Word32' @@ -341,6 +342,15 @@ write32 ptr val stream flags = nothingIfOk =<< cuStreamWriteValue32 stream ptr v , combineBitMasks `[StreamWriteFlag]' } -> `Status' cToEnum #} +#else +{# fun unsafe cuStreamWriteValue32_v2 as cuStreamWriteValue32 + { useStream `Stream' + , useDeviceHandle `DevicePtr Word32' + , `Word32' + , combineBitMasks `[StreamWriteFlag]' + } + -> `Status' cToEnum #} +#endif #endif {-# INLINE write64 #-} @@ -351,6 +361,7 @@ write64 _ _ _ _ = requireSDK 'write64 9.0 write64 ptr val stream flags = nothingIfOk =<< cuStreamWriteValue64 stream ptr val flags {-# INLINE cuStreamWriteValue64 #-} +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamWriteValue64 { useStream `Stream' , useDeviceHandle `DevicePtr Word64' @@ -358,6 +369,15 @@ write64 ptr val stream flags = nothingIfOk =<< cuStreamWriteValue64 stream ptr v , combineBitMasks `[StreamWriteFlag]' } -> `Status' cToEnum #} +#else +{# fun unsafe cuStreamWriteValue64_v2 as cuStreamWriteValue64 + { useStream `Stream' + , useDeviceHandle `DevicePtr Word64' + , `Word64' + , combineBitMasks `[StreamWriteFlag]' + } + -> `Status' cToEnum #} +#endif #endif @@ -388,12 +408,21 @@ wait32 _ _ _ _ = requireSDK 'wait32 8.0 wait32 ptr val stream flags = nothingIfOk =<< cuStreamWaitValue32 stream ptr val flags {-# INLINE cuStreamWaitValue32 #-} +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamWaitValue32 { useStream `Stream' , useDeviceHandle `DevicePtr Word32' , `Word32' , combineBitMasks `[StreamWaitFlag]' } -> `Status' cToEnum #} +#else +{# fun unsafe cuStreamWaitValue32_v2 as cuStreamWaitValue32 + { useStream `Stream' + , useDeviceHandle `DevicePtr Word32' + , `Word32' + , combineBitMasks `[StreamWaitFlag]' + } -> `Status' cToEnum #} +#endif #endif {-# INLINE wait64 #-} @@ -404,12 +433,21 @@ wait64 _ _ _ _ = requireSDK 'wait64 9.0 wait64 ptr val stream flags = nothingIfOk =<< cuStreamWaitValue64 stream ptr val flags {-# INLINE cuStreamWaitValue64 #-} +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamWaitValue64 { useStream `Stream' , useDeviceHandle `DevicePtr Word64' , `Word64' , combineBitMasks `[StreamWaitFlag]' } -> `Status' cToEnum #} +#else +{# fun unsafe cuStreamWaitValue64_v2 as cuStreamWaitValue64 + { useStream `Stream' + , useDeviceHandle `DevicePtr Word64' + , `Word64' + , combineBitMasks `[StreamWaitFlag]' + } -> `Status' cToEnum #} +#endif #endif diff --git a/src/Foreign/CUDA/Driver/Texture.chs b/src/Foreign/CUDA/Driver/Texture.chs deleted file mode 100644 index c06a768..0000000 --- a/src/Foreign/CUDA/Driver/Texture.chs +++ /dev/null @@ -1,308 +0,0 @@ -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE ForeignFunctionInterface #-} -{-# OPTIONS_HADDOCK prune #-} --------------------------------------------------------------------------------- --- | --- Module : Foreign.CUDA.Driver.Texture --- Copyright : [2009..2023] Trevor L. McDonell --- License : BSD --- --- Texture management for low-level driver interface --- --------------------------------------------------------------------------------- - -module Foreign.CUDA.Driver.Texture ( - - -- * Texture Reference Management - Texture(..), Format(..), AddressMode(..), FilterMode(..), ReadMode(..), - bind, bind2D, - getAddressMode, getFilterMode, getFormat, - setAddressMode, setFilterMode, setFormat, setReadMode, - - -- Deprecated - create, destroy, - - -- Internal - peekTex - -) where - -#include "cbits/stubs.h" -{# context lib="cuda" #} - --- Friends -import Foreign.CUDA.Ptr -import Foreign.CUDA.Driver.Error -import Foreign.CUDA.Driver.Marshal -import Foreign.CUDA.Internal.C2HS - --- System -import Foreign -import Foreign.C -import Control.Monad - -#if CUDA_VERSION >= 3020 -{-# DEPRECATED create, destroy "as of CUDA version 3.2" #-} -#endif - - --------------------------------------------------------------------------------- --- Data Types --------------------------------------------------------------------------------- - --- | --- A texture reference --- -newtype Texture = Texture { useTexture :: {# type CUtexref #}} - deriving (Eq, Show) - -instance Storable Texture where - sizeOf _ = sizeOf (undefined :: {# type CUtexref #}) - alignment _ = alignment (undefined :: {# type CUtexref #}) - peek p = Texture `fmap` peek (castPtr p) - poke p t = poke (castPtr p) (useTexture t) - --- | --- Texture reference addressing modes --- -{# enum CUaddress_mode as AddressMode - { underscoreToCase } - with prefix="CU_TR_ADDRESS_MODE" deriving (Eq, Show) #} - --- | --- Texture reference filtering mode --- -{# enum CUfilter_mode as FilterMode - { underscoreToCase } - with prefix="CU_TR_FILTER_MODE" deriving (Eq, Show) #} - --- | --- Texture read mode options --- -#c -typedef enum CUtexture_flag_enum { - CU_TEXTURE_FLAG_READ_AS_INTEGER = CU_TRSF_READ_AS_INTEGER, - CU_TEXTURE_FLAG_NORMALIZED_COORDINATES = CU_TRSF_NORMALIZED_COORDINATES, - CU_TEXTURE_FLAG_SRGB = CU_TRSF_SRGB -} CUtexture_flag; -#endc - -{# enum CUtexture_flag as ReadMode - { underscoreToCase - , CU_TEXTURE_FLAG_SRGB as SRGB } - with prefix="CU_TEXTURE_FLAG" deriving (Eq, Show) #} - --- | --- Texture data formats --- -{# enum CUarray_format as Format - { underscoreToCase - , UNSIGNED_INT8 as Word8 - , UNSIGNED_INT16 as Word16 - , UNSIGNED_INT32 as Word32 - , SIGNED_INT8 as Int8 - , SIGNED_INT16 as Int16 - , SIGNED_INT32 as Int32 } - with prefix="CU_AD_FORMAT" deriving (Eq, Show) #} - - --------------------------------------------------------------------------------- --- Texture management --------------------------------------------------------------------------------- - --- | --- Create a new texture reference. Once created, the application must call --- 'setPtr' to associate the reference with allocated memory. Other texture --- reference functions are used to specify the format and interpretation to be --- used when the memory is read through this reference. --- --- --- -{-# INLINEABLE create #-} -create :: IO Texture -create = resultIfOk =<< cuTexRefCreate - -{-# INLINE cuTexRefCreate #-} -{# fun unsafe cuTexRefCreate - { alloca- `Texture' peekTex* } -> `Status' cToEnum #} - - --- | --- Destroy a texture reference. --- --- --- -{-# INLINEABLE destroy #-} -destroy :: Texture -> IO () -destroy !tex = nothingIfOk =<< cuTexRefDestroy tex - -{-# INLINE cuTexRefDestroy #-} -{# fun unsafe cuTexRefDestroy - { useTexture `Texture' } -> `Status' cToEnum #} - - --- | --- Bind a linear array address of the given size (bytes) as a texture --- reference. Any previously bound references are unbound. --- --- --- -{-# INLINEABLE bind #-} -bind :: Texture -> DevicePtr a -> Int64 -> IO () -bind !tex !dptr !bytes = nothingIfOk =<< cuTexRefSetAddress tex dptr bytes - -{-# INLINE cuTexRefSetAddress #-} -{# fun unsafe cuTexRefSetAddress - { alloca- `Int' - , useTexture `Texture' - , useDeviceHandle `DevicePtr a' - , `Int64' } -> `Status' cToEnum #} - - --- | --- Bind a linear address range to the given texture reference as a --- two-dimensional arena. Any previously bound reference is unbound. Note that --- calls to 'setFormat' can not follow a call to 'bind2D' for the same texture --- reference. --- --- --- -{-# INLINEABLE bind2D #-} -bind2D :: Texture -> Format -> Int -> DevicePtr a -> (Int,Int) -> Int64 -> IO () -bind2D !tex !fmt !chn !dptr (!width,!height) !pitch = - nothingIfOk =<< cuTexRefSetAddress2D_simple tex fmt chn dptr width height pitch - -{-# INLINE cuTexRefSetAddress2D_simple #-} -{# fun unsafe cuTexRefSetAddress2D_simple - { useTexture `Texture' - , cFromEnum `Format' - , `Int' - , useDeviceHandle `DevicePtr a' - , `Int' - , `Int' - , `Int64' } -> `Status' cToEnum #} - - --- | --- Get the addressing mode used by a texture reference, corresponding to the --- given dimension (currently the only supported dimension values are 0 or 1). --- --- --- -{-# INLINEABLE getAddressMode #-} -getAddressMode :: Texture -> Int -> IO AddressMode -getAddressMode !tex !dim = resultIfOk =<< cuTexRefGetAddressMode tex dim - -{-# INLINE cuTexRefGetAddressMode #-} -{# fun unsafe cuTexRefGetAddressMode - { alloca- `AddressMode' peekEnum* - , useTexture `Texture' - , `Int' } -> `Status' cToEnum #} - - --- | --- Get the filtering mode used by a texture reference. --- --- --- -{-# INLINEABLE getFilterMode #-} -getFilterMode :: Texture -> IO FilterMode -getFilterMode !tex = resultIfOk =<< cuTexRefGetFilterMode tex - -{-# INLINE cuTexRefGetFilterMode #-} -{# fun unsafe cuTexRefGetFilterMode - { alloca- `FilterMode' peekEnum* - , useTexture `Texture' } -> `Status' cToEnum #} - - --- | --- Get the data format and number of channel components of the bound texture. --- --- --- -{-# INLINEABLE getFormat #-} -getFormat :: Texture -> IO (Format, Int) -getFormat !tex = do - (!status,!fmt,!dim) <- cuTexRefGetFormat tex - resultIfOk (status,(fmt,dim)) - -{-# INLINE cuTexRefGetFormat #-} -{# fun unsafe cuTexRefGetFormat - { alloca- `Format' peekEnum* - , alloca- `Int' peekIntConv* - , useTexture `Texture' } -> `Status' cToEnum #} - - --- | --- Specify the addressing mode for the given dimension of a texture reference. --- --- --- -{-# INLINEABLE setAddressMode #-} -setAddressMode :: Texture -> Int -> AddressMode -> IO () -setAddressMode !tex !dim !mode = nothingIfOk =<< cuTexRefSetAddressMode tex dim mode - -{-# INLINE cuTexRefSetAddressMode #-} -{# fun unsafe cuTexRefSetAddressMode - { useTexture `Texture' - , `Int' - , cFromEnum `AddressMode' } -> `Status' cToEnum #} - - --- | --- Specify the filtering mode to be used when reading memory through a texture --- reference. --- --- --- -{-# INLINEABLE setFilterMode #-} -setFilterMode :: Texture -> FilterMode -> IO () -setFilterMode !tex !mode = nothingIfOk =<< cuTexRefSetFilterMode tex mode - -{-# INLINE cuTexRefSetFilterMode #-} -{# fun unsafe cuTexRefSetFilterMode - { useTexture `Texture' - , cFromEnum `FilterMode' } -> `Status' cToEnum #} - - --- | --- Specify additional characteristics for reading and indexing the texture --- reference. --- --- --- -{-# INLINEABLE setReadMode #-} -setReadMode :: Texture -> ReadMode -> IO () -setReadMode !tex !mode = nothingIfOk =<< cuTexRefSetFlags tex mode - -{-# INLINE cuTexRefSetFlags #-} -{# fun unsafe cuTexRefSetFlags - { useTexture `Texture' - , cFromEnum `ReadMode' } -> `Status' cToEnum #} - - --- | --- Specify the format of the data and number of packed components per element to --- be read by the texture reference. --- --- --- -{-# INLINEABLE setFormat #-} -setFormat :: Texture -> Format -> Int -> IO () -setFormat !tex !fmt !chn = nothingIfOk =<< cuTexRefSetFormat tex fmt chn - -{-# INLINE cuTexRefSetFormat #-} -{# fun unsafe cuTexRefSetFormat - { useTexture `Texture' - , cFromEnum `Format' - , `Int' } -> `Status' cToEnum #} - - --------------------------------------------------------------------------------- --- Internal --------------------------------------------------------------------------------- - -{-# INLINE peekTex #-} -peekTex :: Ptr {# type CUtexref #} -> IO Texture -peekTex = liftM Texture . peek - diff --git a/src/Foreign/CUDA/Runtime/Device.chs b/src/Foreign/CUDA/Runtime/Device.chs index fa58f9e..90e2089 100644 --- a/src/Foreign/CUDA/Runtime/Device.chs +++ b/src/Foreign/CUDA/Runtime/Device.chs @@ -219,9 +219,15 @@ props :: Device -> IO DeviceProperties props !n = resultIfOk =<< cudaGetDeviceProperties n {-# INLINE cudaGetDeviceProperties #-} +#if CUDA_VERSION < 12000 {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #} +#else +{# fun unsafe cudaGetDeviceProperties_v2 as cudaGetDeviceProperties + { alloca- `DeviceProperties' peek* + , `Int' } -> `Status' cToEnum #} +#endif -- | diff --git a/src/Foreign/CUDA/Runtime/Texture.chs b/src/Foreign/CUDA/Runtime/Texture.chs deleted file mode 100644 index 94e8db9..0000000 --- a/src/Foreign/CUDA/Runtime/Texture.chs +++ /dev/null @@ -1,203 +0,0 @@ -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE ForeignFunctionInterface #-} --------------------------------------------------------------------------------- --- | --- Module : Foreign.CUDA.Runtime.Texture --- Copyright : [2009..2023] Trevor L. McDonell --- License : BSD --- --- Texture references --- --------------------------------------------------------------------------------- - -module Foreign.CUDA.Runtime.Texture ( - - -- * Texture Reference Management - Texture(..), FormatKind(..), AddressMode(..), FilterMode(..), FormatDesc(..), - bind, bind2D - -) where - --- Friends -import Foreign.CUDA.Ptr -import Foreign.CUDA.Runtime.Error -import Foreign.CUDA.Internal.C2HS - --- System -import Data.Int -import Foreign -import Foreign.C - -#include "cbits/stubs.h" -{# context lib="cudart" #} - -#c -typedef struct textureReference textureReference; -typedef struct cudaChannelFormatDesc cudaChannelFormatDesc; -#endc - --------------------------------------------------------------------------------- --- Data Types --------------------------------------------------------------------------------- - --- |A texture reference --- -{# pointer *textureReference as ^ -> Texture #} - -data Texture = Texture - { - normalised :: !Bool, -- ^ access texture using normalised coordinates [0.0,1.0) - filtering :: !FilterMode, - addressing :: !(AddressMode, AddressMode, AddressMode), - format :: !FormatDesc - } - deriving (Eq, Show) - --- |Texture channel format kind --- -{# enum cudaChannelFormatKind as FormatKind - { } - with prefix="cudaChannelFormatKind" deriving (Eq, Show) #} - --- |Texture addressing mode --- -{# enum cudaTextureAddressMode as AddressMode - { } - with prefix="cudaAddressMode" deriving (Eq, Show) #} - --- |Texture filtering mode --- -{# enum cudaTextureFilterMode as FilterMode - { } - with prefix="cudaFilterMode" deriving (Eq, Show) #} - - --- |A description of how memory read through the texture cache should be --- interpreted, including the kind of data and the number of bits of each --- component (x,y,z and w, respectively). --- -{# pointer *cudaChannelFormatDesc as ^ foreign -> FormatDesc nocode #} - -data FormatDesc = FormatDesc - { - depth :: !(Int,Int,Int,Int), - kind :: !FormatKind - } - deriving (Eq, Show) - -instance Storable FormatDesc where - sizeOf _ = {# sizeof cudaChannelFormatDesc #} - alignment _ = alignment (undefined :: Ptr ()) - - peek p = do - dx <- cIntConv `fmap` {# get cudaChannelFormatDesc.x #} p - dy <- cIntConv `fmap` {# get cudaChannelFormatDesc.y #} p - dz <- cIntConv `fmap` {# get cudaChannelFormatDesc.z #} p - dw <- cIntConv `fmap` {# get cudaChannelFormatDesc.w #} p - df <- cToEnum `fmap` {# get cudaChannelFormatDesc.f #} p - return $ FormatDesc (dx,dy,dz,dw) df - - poke p (FormatDesc (x,y,z,w) k) = do - {# set cudaChannelFormatDesc.x #} p (cIntConv x) - {# set cudaChannelFormatDesc.y #} p (cIntConv y) - {# set cudaChannelFormatDesc.z #} p (cIntConv z) - {# set cudaChannelFormatDesc.w #} p (cIntConv w) - {# set cudaChannelFormatDesc.f #} p (cFromEnum k) - - -instance Storable Texture where - sizeOf _ = {# sizeof textureReference #} - alignment _ = alignment (undefined :: Ptr ()) - - peek p = do - norm <- cToBool `fmap` {# get textureReference.normalized #} p - fmt <- cToEnum `fmap` {# get textureReference.filterMode #} p - dsc <- peek . castPtr =<< {# get textureReference.channelDesc #} p - [x,y,z] <- peekArrayWith cToEnum 3 =<< {# get textureReference.addressMode #} p - return $ Texture norm fmt (x,y,z) dsc - - poke p (Texture norm fmt (x,y,z) dsc) = do - {# set textureReference.normalized #} p (cFromBool norm) - {# set textureReference.filterMode #} p (cFromEnum fmt) - withArray (map cFromEnum [x,y,z]) ({# set textureReference.addressMode #} p) - - -- c2hs is returning the wrong type for structs-within-structs - dscptr <- {# get textureReference.channelDesc #} p - poke (castPtr dscptr) dsc - - --------------------------------------------------------------------------------- --- Texture References --------------------------------------------------------------------------------- - --- |Bind the memory area associated with the device pointer to a texture --- reference given by the named symbol. Any previously bound references are --- unbound. --- -{-# INLINEABLE bind #-} -bind :: String -> Texture -> DevicePtr a -> Int64 -> IO () -bind !name !tex !dptr !bytes = do - ref <- getTex name - poke ref tex - nothingIfOk =<< cudaBindTexture ref dptr (format tex) bytes - -{-# INLINE cudaBindTexture #-} -{# fun unsafe cudaBindTexture - { alloca- `Int' - , id `TextureReference' - , dptr `DevicePtr a' - , with_* `FormatDesc' - , `Int64' } -> `Status' cToEnum #} - where dptr = useDevicePtr . castDevPtr - --- |Bind the two-dimensional memory area to the texture reference associated --- with the given symbol. The size of the area is constrained by (width,height) --- in texel units, and the row pitch in bytes. Any previously bound references --- are unbound. --- -{-# INLINEABLE bind2D #-} -bind2D :: String -> Texture -> DevicePtr a -> (Int,Int) -> Int64 -> IO () -bind2D !name !tex !dptr (!width,!height) !bytes = do - ref <- getTex name - poke ref tex - nothingIfOk =<< cudaBindTexture2D ref dptr (format tex) width height bytes - -{-# INLINE cudaBindTexture2D #-} -{# fun unsafe cudaBindTexture2D - { alloca- `Int' - , id `TextureReference' - , dptr `DevicePtr a' - , with_* `FormatDesc' - , `Int' - , `Int' - , `Int64' } -> `Status' cToEnum #} - where dptr = useDevicePtr . castDevPtr - - --- |Returns the texture reference associated with the given symbol --- -{-# INLINEABLE getTex #-} -getTex :: String -> IO TextureReference -getTex !name = resultIfOk =<< cudaGetTextureReference name - -{-# INLINE cudaGetTextureReference #-} -{# fun unsafe cudaGetTextureReference - { alloca- `Ptr Texture' peek* - , withCString_* `String' } -> `Status' cToEnum #} - - --------------------------------------------------------------------------------- --- Internal --------------------------------------------------------------------------------- - -{-# INLINE with_ #-} -with_ :: Storable a => a -> (Ptr a -> IO b) -> IO b -with_ = with - - --- CUDA 5.0 changed the types of some attributes from char* to void* --- -{-# INLINE withCString_ #-} -withCString_ :: String -> (Ptr a -> IO b) -> IO b -withCString_ !str !fn = withCString str (fn . castPtr) -