Provided by: nvidia-cuda-dev_9.1.85-3ubuntu1_amd64

NAME

       Memory Management -

   Functions
       cudaError_t cudaArrayGetInfo (struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int
           *flags, cudaArray_t array)
           Gets info about the specified cudaArray.
       __cudart_builtin__ cudaError_t cudaFree (void *devPtr)
           Frees memory on the device.
       cudaError_t cudaFreeArray (cudaArray_t array)
           Frees an array on the device.
       cudaError_t cudaFreeHost (void *ptr)
           Frees page-locked memory.
       cudaError_t cudaFreeMipmappedArray (cudaMipmappedArray_t mipmappedArray)
           Frees a mipmapped array on the device.
       cudaError_t cudaGetMipmappedArrayLevel (cudaArray_t *levelArray, cudaMipmappedArray_const_t
           mipmappedArray, unsigned int level)
           Gets a mipmap level of a CUDA mipmapped array.
       cudaError_t cudaGetSymbolAddress (void **devPtr, const void *symbol)
           Finds the address associated with a CUDA symbol.
       cudaError_t cudaGetSymbolSize (size_t *size, const void *symbol)
           Finds the size of the object associated with a CUDA symbol.
       cudaError_t cudaHostAlloc (void **pHost, size_t size, unsigned int flags)
           Allocates page-locked memory on the host.
       cudaError_t cudaHostGetDevicePointer (void **pDevice, void *pHost, unsigned int flags)
           Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by
           cudaHostRegister.
       cudaError_t cudaHostGetFlags (unsigned int *pFlags, void *pHost)
           Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.
       cudaError_t cudaHostRegister (void *ptr, size_t size, unsigned int flags)
           Registers an existing host memory range for use by CUDA.
       cudaError_t cudaHostUnregister (void *ptr)
           Unregisters a memory range that was registered with cudaHostRegister.
       __cudart_builtin__ cudaError_t cudaMalloc (void **devPtr, size_t size)
           Allocate memory on the device.
       cudaError_t cudaMalloc3D (struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent)
           Allocates logical 1D, 2D, or 3D memory objects on the device.
       cudaError_t cudaMalloc3DArray (cudaArray_t *array, const struct cudaChannelFormatDesc *desc, struct
           cudaExtent extent, unsigned int flags=0)
           Allocate an array on the device.
       cudaError_t cudaMallocArray (cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width,
           size_t height=0, unsigned int flags=0)
           Allocate an array on the device.
       cudaError_t cudaMallocHost (void **ptr, size_t size)
           Allocates page-locked memory on the host.
       __cudart_builtin__ cudaError_t cudaMallocManaged (void **devPtr, size_t size, unsigned int flags=0x01)
           Allocates memory that will be automatically managed by the Unified Memory system.
       cudaError_t cudaMallocMipmappedArray (cudaMipmappedArray_t *mipmappedArray, const struct
           cudaChannelFormatDesc *desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags=0)
           Allocate a mipmapped array on the device.
       cudaError_t cudaMallocPitch (void **devPtr, size_t *pitch, size_t width, size_t height)
           Allocates pitched memory on the device.
       cudaError_t cudaMemAdvise (const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device)
           Advise about the usage of a given memory range.
       cudaError_t cudaMemcpy (void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2D (void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t
           height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
           cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum
           cudaMemcpyKind kind=cudaMemcpyDeviceToDevice)
           Copies data between host and device.
       __cudart_builtin__ cudaError_t cudaMemcpy2DAsync (void *dst, size_t dpitch, const void *src, size_t
           spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DFromArray (void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
           size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DFromArrayAsync (void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset,
           size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t
           spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
           size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy3D (const struct cudaMemcpy3DParms *p)
           Copies data between 3D objects.
       __cudart_builtin__ cudaError_t cudaMemcpy3DAsync (const struct cudaMemcpy3DParms *p, cudaStream_t
           stream=0)
           Copies data between 3D objects.
       cudaError_t cudaMemcpy3DPeer (const struct cudaMemcpy3DPeerParms *p)
           Copies memory between devices.
       cudaError_t cudaMemcpy3DPeerAsync (const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream=0)
           Copies memory between devices asynchronously.
       cudaError_t cudaMemcpyArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
           cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind
           kind=cudaMemcpyDeviceToDevice)
           Copies data between host and device.
       __cudart_builtin__ cudaError_t cudaMemcpyAsync (void *dst, const void *src, size_t count, enum
           cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromArray (void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t
           count, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromArrayAsync (void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset,
           size_t count, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromSymbol (void *dst, const void *symbol, size_t count, size_t offset=0, enum
           cudaMemcpyKind kind=cudaMemcpyDeviceToHost)
           Copies data from the given symbol on the device.
       cudaError_t cudaMemcpyFromSymbolAsync (void *dst, const void *symbol, size_t count, size_t offset, enum
           cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data from the given symbol on the device.
       cudaError_t cudaMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice, size_t count)
           Copies memory between two devices.
       cudaError_t cudaMemcpyPeerAsync (void *dst, int dstDevice, const void *src, int srcDevice, size_t count,
           cudaStream_t stream=0)
           Copies memory between two devices asynchronously.
       cudaError_t cudaMemcpyToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t
           count, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpyToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src,
           size_t count, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyToSymbol (const void *symbol, const void *src, size_t count, size_t offset=0, enum
           cudaMemcpyKind kind=cudaMemcpyHostToDevice)
           Copies data to the given symbol on the device.
       cudaError_t cudaMemcpyToSymbolAsync (const void *symbol, const void *src, size_t count, size_t offset,
           enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data to the given symbol on the device.
       cudaError_t cudaMemGetInfo (size_t *free, size_t *total)
           Gets free and total device memory.
       cudaError_t cudaMemPrefetchAsync (const void *devPtr, size_t count, int dstDevice, cudaStream_t stream=0)
           Prefetches memory to the specified destination device.
       cudaError_t cudaMemRangeGetAttribute (void *data, size_t dataSize, enum cudaMemRangeAttribute attribute,
           const void *devPtr, size_t count)
           Query an attribute of a given memory range.
       cudaError_t cudaMemRangeGetAttributes (void **data, size_t *dataSizes, enum cudaMemRangeAttribute
           *attributes, size_t numAttributes, const void *devPtr, size_t count)
           Query attributes of a given memory range.
       cudaError_t cudaMemset (void *devPtr, int value, size_t count)
           Initializes or sets device memory to a value.
       cudaError_t cudaMemset2D (void *devPtr, size_t pitch, int value, size_t width, size_t height)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemset2DAsync (void *devPtr, size_t pitch, int value, size_t width,
           size_t height, cudaStream_t stream=0)
           Initializes or sets device memory to a value.
       cudaError_t cudaMemset3D (struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemset3DAsync (struct cudaPitchedPtr pitchedDevPtr, int value, struct
           cudaExtent extent, cudaStream_t stream=0)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemsetAsync (void *devPtr, int value, size_t count, cudaStream_t
           stream=0)
           Initializes or sets device memory to a value.
       struct cudaExtent make_cudaExtent (size_t w, size_t h, size_t d)
           Returns a cudaExtent based on input parameters.
       struct cudaPitchedPtr make_cudaPitchedPtr (void *d, size_t p, size_t xsz, size_t ysz)
           Returns a cudaPitchedPtr based on input parameters.
       struct cudaPos make_cudaPos (size_t x, size_t y, size_t z)
           Returns a cudaPos based on input parameters.

Detailed Description

        This section describes the memory management functions of the CUDA runtime application programming
        interface (cuda_runtime_api.h).

       Some functions have overloaded C++ API template versions documented separately in the C++ API Routines
       module.

Function Documentation

   cudaError_t cudaArrayGetInfo (struct cudaChannelFormatDesc * desc, struct cudaExtent * extent, unsigned int *
       flags, cudaArray_t array)
       Returns in *desc, *extent and *flags respectively, the type, shape and flags of array.

       Any of *desc, *extent and *flags may be specified as NULL.

       Parameters:
           desc - Returned array type
           extent - Returned array shape. 2D arrays will have depth of zero
           flags - Returned array flags
           array - The cudaArray to get info for

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cuArrayGetDescriptor, cuArray3DGetDescriptor
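
        For illustration, a minimal sketch (not part of the original reference) that queries an existing array;
        the handle is assumed to come from a prior cudaMallocArray() or cudaMalloc3DArray() call:

            #include <stdio.h>

            void printArrayInfo(cudaArray_t arr)      /* arr: pre-existing array handle */
            {
                struct cudaChannelFormatDesc desc;
                struct cudaExtent extent;
                unsigned int flags;

                if (cudaArrayGetInfo(&desc, &extent, &flags, arr) == cudaSuccess)
                    /* extent.depth is zero for 2D arrays */
                    printf("%zu x %zu x %zu, flags=0x%x\n",
                           extent.width, extent.height, extent.depth, flags);
            }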

   __cudart_builtin__ cudaError_t cudaFree (void * devPtr)
       Frees the memory space pointed to by devPtr, which must have been returned by a previous call to
       cudaMalloc() or cudaMallocPitch(). Otherwise, or if cudaFree(devPtr) has already been called before, an
       error is returned. If devPtr is 0, no operation is performed. cudaFree() returns
       cudaErrorInvalidDevicePointer in case of failure.

       The device version of cudaFree cannot be used with a *devPtr allocated using the host API, and vice
       versa.

       Parameters:
           devPtr - Device pointer to memory to free

       Returns:
           cudaSuccess, cudaErrorInvalidDevicePointer, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaMallocArray, cudaFreeArray, cudaMallocHost (C API), cudaFreeHost,
           cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemFree

   cudaError_t cudaFreeArray (cudaArray_t array)
        Frees the CUDA array array, which must have been returned by a previous call to cudaMallocArray(). If
        cudaFreeArray(array) has already been called before, cudaErrorInvalidValue is returned. If array is 0,
        no operation is performed.

       Parameters:
           array - Pointer to array to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaMallocHost (C API), cudaFreeHost,
           cudaHostAlloc, cuArrayDestroy

   cudaError_t cudaFreeHost (void * ptr)
        Frees the memory space pointed to by ptr, which must have been returned by a previous call to
       cudaMallocHost() or cudaHostAlloc().

       Parameters:
           ptr - Pointer to memory to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaFreeArray, cudaMallocHost (C API),
           cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemFreeHost

   cudaError_t cudaFreeMipmappedArray (cudaMipmappedArray_t mipmappedArray)
       Frees the CUDA mipmapped array mipmappedArray, which must have been returned by a previous call to
       cudaMallocMipmappedArray(). If cudaFreeMipmappedArray(mipmappedArray) has already been called before,
       cudaErrorInvalidValue is returned.

       Parameters:
           mipmappedArray - Pointer to mipmapped array to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaMallocHost (C API), cudaFreeHost,
           cudaHostAlloc, cuMipmappedArrayDestroy

   cudaError_t cudaGetMipmappedArrayLevel (cudaArray_t * levelArray, cudaMipmappedArray_const_t mipmappedArray,
       unsigned int level)
       Returns in *levelArray a CUDA array that represents a single mipmap level of the CUDA mipmapped array
       mipmappedArray.

       If level is greater than the maximum number of levels in this mipmapped array, cudaErrorInvalidValue is
       returned.

       Parameters:
           levelArray - Returned mipmap level CUDA array
           mipmappedArray - CUDA mipmapped array
           level - Mipmap level

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc3D, cudaMalloc, cudaMallocPitch, cudaFree, cudaFreeArray, cudaMallocHost (C API),
           cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuMipmappedArrayGetLevel

   cudaError_t cudaGetSymbolAddress (void ** devPtr, const void * symbol)
       Returns in *devPtr the address of symbol symbol on the device. symbol is a variable that resides in
       global or constant memory space. If symbol cannot be found, or if symbol is not declared in the global or
       constant memory space, *devPtr is unchanged and the error cudaErrorInvalidSymbol is returned.

       Parameters:
           devPtr - Return device pointer associated with symbol
           symbol - Device symbol address

       Returns:
           cudaSuccess, cudaErrorInvalidSymbol, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and removed in
           CUDA 5.0.

       See also:
           cudaGetSymbolAddress (C++ API), cudaGetSymbolSize (C API), cuModuleGetGlobal
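
        As a sketch (the __device__ variable is hypothetical): the symbol itself is passed, not a string naming
        it, since string names were removed in CUDA 5.0:

            __device__ int devCounter;                /* hypothetical device symbol */

            void zeroCounter(void)
            {
                void *addr;

                /* pass the symbol, not the string "devCounter" */
                if (cudaGetSymbolAddress(&addr, devCounter) == cudaSuccess)
                    cudaMemset(addr, 0, sizeof(int));
            }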

   cudaError_t cudaGetSymbolSize (size_t * size, const void * symbol)
       Returns in *size the size of symbol symbol. symbol is a variable that resides in global or constant
       memory space. If symbol cannot be found, or if symbol is not declared in global or constant memory space,
       *size is unchanged and the error cudaErrorInvalidSymbol is returned.

       Parameters:
           size - Size of object associated with symbol
           symbol - Device symbol address

       Returns:
           cudaSuccess, cudaErrorInvalidSymbol, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and removed in
           CUDA 5.0.

       See also:
           cudaGetSymbolAddress (C API), cudaGetSymbolSize (C++ API), cuModuleGetGlobal

   cudaError_t cudaHostAlloc (void ** pHost, size_t size, unsigned int flags)
       Allocates size bytes of host memory that is page-locked and accessible to the device. The driver tracks
       the virtual memory ranges allocated with this function and automatically accelerates calls to functions
       such as cudaMemcpy(). Since the memory can be accessed directly by the device, it can be read or written
       with much higher bandwidth than pageable memory obtained with functions such as malloc(). Allocating
       excessive amounts of pinned memory may degrade system performance, since it reduces the amount of memory
       available to the system for paging. As a result, this function is best used sparingly to allocate staging
       areas for data exchange between host and device.

       The flags parameter enables different options to be specified that affect the allocation, as follows.

       • cudaHostAllocDefault:  This  flag's  value  is  defined  to  be 0 and causes cudaHostAlloc() to emulate
         cudaMallocHost().

       • cudaHostAllocPortable: The memory returned by this call will be considered as pinned memory by all CUDA
         contexts, not just the one that performed the allocation.

       • cudaHostAllocMapped: Maps the allocation into the CUDA address space. The device pointer to the  memory
         may be obtained by calling cudaHostGetDevicePointer().

       • cudaHostAllocWriteCombined:  Allocates  the memory as write-combined (WC). WC memory can be transferred
         across the PCI Express bus more quickly on some system configurations, but cannot be  read  efficiently
         by  most  CPUs.  WC memory is a good option for buffers that will be written by the CPU and read by the
         device via mapped pinned memory or host->device transfers.

       All of these flags are orthogonal to one another: a developer  may  allocate  memory  that  is  portable,
       mapped and/or write-combined with no restrictions.

       In  order  for  the  cudaHostAllocMapped  flag  to  have  any  effect,  the CUDA context must support the
       cudaDeviceMapHost flag, which can be checked via  cudaGetDeviceFlags().  The  cudaDeviceMapHost  flag  is
       implicitly set for contexts created via the runtime API.

       The  cudaHostAllocMapped  flag  may  be specified on CUDA contexts for devices that do not support mapped
       pinned memory. The failure is deferred to cudaHostGetDevicePointer() because the  memory  may  be  mapped
       into other CUDA contexts via the cudaHostAllocPortable flag.

       Memory allocated by this function must be freed with cudaFreeHost().

       Parameters:
            pHost - Returned host pointer to page-locked memory
           size - Requested allocation size in bytes
           flags - Requested properties of allocated memory

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaSetDeviceFlags, cudaMallocHost (C API), cudaFreeHost, cudaGetDeviceFlags, cuMemHostAlloc
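
        A minimal sketch, assuming the device supports mapped pinned memory (error checking abbreviated):

            void stagingExample(void)
            {
                float *h_buf, *d_buf;
                size_t bytes = 1 << 20;               /* arbitrary example size */

                /* pinned host memory, mapped into the device address space */
                cudaHostAlloc((void **)&h_buf, bytes, cudaHostAllocMapped);
                cudaHostGetDevicePointer((void **)&d_buf, h_buf, 0);

                /* kernels may dereference d_buf; the host reads and writes h_buf */

                cudaFreeHost(h_buf);
            }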

   cudaError_t cudaHostGetDevicePointer (void ** pDevice, void * pHost, unsigned int flags)
       Passes   back  the  device  pointer  corresponding  to  the  mapped,  pinned  host  buffer  allocated  by
       cudaHostAlloc() or registered by cudaHostRegister().

       cudaHostGetDevicePointer() will fail if the cudaDeviceMapHost flag  was  not  specified  before  deferred
       context creation occurred, or if called on a device that does not support mapped, pinned memory.

       For      devices      that     have     a     non-zero     value     for     the     device     attribute
       cudaDevAttrCanUseHostPointerForRegisteredMem, the memory can also be accessed from the device  using  the
       host  pointer  pHost.  The device pointer returned by cudaHostGetDevicePointer() may or may not match the
       original host pointer pHost and depends on the devices visible to the application. If all devices visible
       to the application have a non-zero value for  the  device  attribute,  the  device  pointer  returned  by
       cudaHostGetDevicePointer()  will  match  the  original  pointer  pHost.  If  any  device  visible  to the
       application  has  a  zero  value  for  the   device   attribute,   the   device   pointer   returned   by
       cudaHostGetDevicePointer()  will  not  match the original host pointer pHost, but it will be suitable for
       use on all devices provided Unified Virtual Addressing is enabled. In such systems, it is valid to access
        the memory using either pointer on devices that have a non-zero value for the device attribute. Note
        however that such devices should access the memory using only one of the two pointers, not both.

        The flags parameter is reserved for future releases; for now, it must be set to 0.

       Parameters:
           pDevice - Returned device pointer for mapped memory
           pHost - Requested host pointer mapping
           flags - Flags for extensions (must be 0 for now)

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaSetDeviceFlags, cudaHostAlloc, cuMemHostGetDevicePointer

   cudaError_t cudaHostGetFlags (unsigned int * pFlags, void * pHost)
       cudaHostGetFlags()  will  fail  if  the  input  pointer  does not reside in an address range allocated by
       cudaHostAlloc().

       Parameters:
           pFlags - Returned flags word
           pHost - Host pointer

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaHostAlloc, cuMemHostGetFlags

   cudaError_t cudaHostRegister (void * ptr, size_t size, unsigned int flags)
       Page-locks the memory range specified by ptr and size and maps it  for  the  device(s)  as  specified  by
       flags. This memory range also is added to the same tracking mechanism as cudaHostAlloc() to automatically
       accelerate  calls  to  functions  such  as cudaMemcpy(). Since the memory can be accessed directly by the
       device, it can be read or written with much higher bandwidth than  pageable  memory  that  has  not  been
       registered. Page-locking excessive amounts of memory may degrade system performance, since it reduces the
       amount of memory available to the system for paging. As a result, this function is best used sparingly to
       register staging areas for data exchange between host and device.

        cudaHostRegister is not supported on non-I/O-coherent devices.

       The flags parameter enables different options to be specified that affect the allocation, as follows.

       • cudaHostRegisterDefault:  On  a  system with unified virtual addressing, the memory will be both mapped
         and portable. On a system with no unified virtual addressing, the memory will  be  neither  mapped  nor
         portable.

       • cudaHostRegisterPortable:  The  memory returned by this call will be considered as pinned memory by all
         CUDA contexts, not just the one that performed the allocation.

       • cudaHostRegisterMapped: Maps the allocation into the CUDA address space.  The  device  pointer  to  the
         memory may be obtained by calling cudaHostGetDevicePointer().

        • cudaHostRegisterIoMemory: The passed memory pointer is treated as pointing to some memory-mapped I/O
          space, e.g. belonging to a third-party PCIe device, and it will be marked as non-cache-coherent and
          contiguous.

       All  of  these  flags are orthogonal to one another: a developer may page-lock memory that is portable or
       mapped with no restrictions.

        The CUDA context must have been created with the cudaDeviceMapHost flag in order for the
        cudaHostRegisterMapped flag to have any effect.

       The cudaHostRegisterMapped flag may be specified on CUDA contexts for devices that do not support  mapped
       pinned  memory.  The  failure  is deferred to cudaHostGetDevicePointer() because the memory may be mapped
       into other CUDA contexts via the cudaHostRegisterPortable flag.

       For     devices     that     have     a     non-zero     value     for     the      device      attribute
       cudaDevAttrCanUseHostPointerForRegisteredMem,  the  memory can also be accessed from the device using the
       host pointer ptr. The device pointer returned by cudaHostGetDevicePointer() may  or  may  not  match  the
       original  host  pointer ptr and depends on the devices visible to the application. If all devices visible
       to the application have a non-zero value for  the  device  attribute,  the  device  pointer  returned  by
       cudaHostGetDevicePointer()  will match the original pointer ptr. If any device visible to the application
       has a zero value for the device attribute, the device pointer returned by cudaHostGetDevicePointer() will
       not match the original host pointer ptr, but it will be suitable for use on all devices provided  Unified
       Virtual  Addressing is enabled. In such systems, it is valid to access the memory using either pointer on
        devices that have a non-zero value for the device attribute. Note however that such devices should access
        the memory using only one of the two pointers, not both.

       The memory page-locked by this function must be unregistered with cudaHostUnregister().

       Parameters:
           ptr - Host pointer to memory to page-lock
            size - Size in bytes of the address range to page-lock
           flags - Flags for allocation request

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation,  cudaErrorHostMemoryAlreadyRegistered,
           cudaErrorNotSupported

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaHostUnregister, cudaHostGetFlags, cudaHostGetDevicePointer, cuMemHostRegister
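
        As a sketch, pinning an existing malloc()ed buffer so that later copies involving it take the
        accelerated path (the size is illustrative):

            #include <stdlib.h>

            void pinExisting(void)
            {
                size_t bytes = 4096;                  /* illustrative size */
                void *buf = malloc(bytes);

                if (cudaHostRegister(buf, bytes, cudaHostRegisterDefault) == cudaSuccess) {
                    /* cudaMemcpy() to and from buf now takes the pinned fast path */
                    cudaHostUnregister(buf);
                }
                free(buf);
            }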

   cudaError_t cudaHostUnregister (void * ptr)
       Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.

       The base address must be the same one specified to cudaHostRegister().

       Parameters:
           ptr - Host pointer to memory to unregister

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorHostMemoryNotRegistered

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
            cudaHostRegister, cuMemHostUnregister

   __cudart_builtin__ cudaError_t cudaMalloc (void ** devPtr, size_t size)
       Allocates  size  bytes  of  linear memory on the device and returns in *devPtr a pointer to the allocated
       memory. The allocated memory is suitably aligned for any kind of variable. The  memory  is  not  cleared.
       cudaMalloc() returns cudaErrorMemoryAllocation in case of failure.

       The  device  version  of  cudaFree  cannot  be used with a *devPtr allocated using the host API, and vice
       versa.

       Parameters:
           devPtr - Pointer to allocated device memory
           size - Requested allocation size in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       See also:
           cudaMallocPitch,   cudaFree,   cudaMallocArray,   cudaFreeArray,   cudaMalloc3D,   cudaMalloc3DArray,
           cudaMallocHost (C API), cudaFreeHost, cudaHostAlloc, cuMemAlloc
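
        The canonical allocate/copy/free round trip, sketched with illustrative sizes (error checking
        abbreviated):

            void roundTrip(void)
            {
                float h_data[256] = {0};
                float *d_data;
                size_t bytes = sizeof(h_data);

                cudaMalloc((void **)&d_data, bytes);
                cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
                /* ... launch kernels operating on d_data ... */
                cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);
                cudaFree(d_data);
            }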

   cudaError_t cudaMalloc3D (struct cudaPitchedPtr * pitchedDevPtr, struct cudaExtent extent)
       Allocates  at  least  width  *  height  *  depth  bytes  of  linear  memory  on  the device and returns a
       cudaPitchedPtr in which ptr is a pointer to the allocated memory. The function may pad the allocation  to
       ensure hardware alignment requirements are met. The pitch returned in the pitch field of pitchedDevPtr is
       the width in bytes of the allocation.

       The  returned  cudaPitchedPtr contains additional fields xsize and ysize, the logical width and height of
       the allocation, which are equivalent to the width and height extent parameters provided by the programmer
       during allocation.

       For allocations of 2D and 3D objects, it is highly recommended that programmers perform allocations using
       cudaMalloc3D() or cudaMallocPitch(). Due to alignment restrictions in the hardware,  this  is  especially
       true  if  the  application  will  be  performing memory copies involving 2D or 3D objects (whether linear
       memory or CUDA arrays).

       Parameters:
           pitchedDevPtr - Pointer to allocated pitched device memory
           extent - Requested allocation size (width field in bytes)

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMallocPitch,   cudaFree,   cudaMemcpy3D,   cudaMemset3D,   cudaMalloc3DArray,    cudaMallocArray,
           cudaFreeArray,    cudaMallocHost   (C   API),   cudaFreeHost,   cudaHostAlloc,   make_cudaPitchedPtr,
           make_cudaExtent, cuMemAllocPitch
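
        A sketch of a 3D pitched allocation; note that the width field of the extent is given in bytes:

            void alloc3D(void)
            {
                struct cudaPitchedPtr p;
                /* 64 x 64 x 64 volume of floats: width in bytes, height and depth in elements */
                struct cudaExtent extent = make_cudaExtent(64 * sizeof(float), 64, 64);

                if (cudaMalloc3D(&p, extent) == cudaSuccess) {
                    cudaMemset3D(p, 0, extent);       /* rows are p.pitch bytes apart */
                    cudaFree(p.ptr);
                }
            }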

   cudaError_t cudaMalloc3DArray (cudaArray_t  *  array,  const  struct  cudaChannelFormatDesc  *  desc,  struct
       cudaExtent extent, unsigned int flags = 0)
       Allocates  a CUDA array according to the cudaChannelFormatDesc structure desc and returns a handle to the
       new CUDA array in *array.

       The cudaChannelFormatDesc is defined as:

           struct cudaChannelFormatDesc {
               int x, y, z, w;
               enum cudaChannelFormatKind f;
           };

        where cudaChannelFormatKind is one of cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned, or
        cudaChannelFormatKindFloat.

       cudaMalloc3DArray() can allocate the following:

       • A 1D array is allocated if the height and depth extents are both zero.

       • A 2D array is allocated if only the depth extent is zero.

       • A 3D array is allocated if all three extents are non-zero.

       • A 1D layered CUDA array is allocated if only the height extent is zero and the cudaArrayLayered flag is
         set. Each layer is a 1D array. The number of layers is determined by the depth extent.

       • A 2D layered CUDA array is allocated if all three extents are non-zero and the cudaArrayLayered flag is
         set. Each layer is a 2D array. The number of layers is determined by the depth extent.

       • A  cubemap  CUDA  array is allocated if all three extents are non-zero and the cudaArrayCubemap flag is
         set. Width must be equal to height, and depth must be six. A cubemap is a special type  of  2D  layered
         CUDA  array,  where  the  six  layers represent the six faces of a cube. The order of the six layers in
         memory is the same as that listed in cudaGraphicsCubeFace.

       • A cubemap layered CUDA array is allocated if all three extents are non-zero, and both, cudaArrayCubemap
         and cudaArrayLayered flags are set. Width must be equal to height, and depth must be a multiple of six.
         A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists of  a  collection
         of  cubemaps.  The  first  six  layers represent the first cubemap, the next six layers form the second
         cubemap, and so on.

       The flags parameter enables different options to be specified that affect the allocation, as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation

       • cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers

       • cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six.
         If the cudaArrayLayered flag is also set, depth must be a multiple of six.

       • cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface
         reference.

       • cudaArrayTextureGather: This flag indicates that texture gather operations will  be  performed  on  the
         CUDA array. Texture gather can only be performed on 2D CUDA arrays.

        The width, height and depth extents must meet per-device size limits (those reported in cudaDeviceProp,
        e.g. maxTexture3D for 3D arrays). All values are specified in elements.

       Note  that  2D CUDA arrays have different size requirements if the cudaArrayTextureGather flag is set. In
       that   case,   the   valid   range   for   (width,   height,   depth)   is    ((1,maxTexture2DGather[0]),
       (1,maxTexture2DGather[1]), 0).

       Parameters:
           array - Pointer to allocated array in device memory
           desc - Requested channel format
           extent - Requested allocation size (width field in elements)
           flags - Flags for extensions

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc3D,   cudaMalloc,   cudaMallocPitch,   cudaFree,  cudaFreeArray,  cudaMallocHost  (C  API),
           cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuArray3DCreate

   cudaError_t cudaMallocArray (cudaArray_t * array, const struct cudaChannelFormatDesc *  desc,  size_t  width,
       size_t height = 0, unsigned int flags = 0)
       Allocates  a CUDA array according to the cudaChannelFormatDesc structure desc and returns a handle to the
       new CUDA array in *array.

       The cudaChannelFormatDesc is defined as:

            struct cudaChannelFormatDesc {
                int x, y, z, w;
                enum cudaChannelFormatKind f;
            };

        where cudaChannelFormatKind is one of cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned, or
        cudaChannelFormatKindFloat.

       The flags parameter enables different options to be specified that affect the allocation, as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation

       • cudaArraySurfaceLoadStore:  Allocates  an  array  that  can  be read from or written to using a surface
         reference

       • cudaArrayTextureGather: This flag indicates that texture gather operations will  be  performed  on  the
         array.

       width and height must meet certain size requirements. See cudaMalloc3DArray() for more details.

       Parameters:
           array - Pointer to allocated array in device memory
           desc - Requested channel format
           width - Requested array allocation width
           height - Requested array allocation height
           flags - Requested properties of allocated array

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc,   cudaMallocPitch,   cudaFree,   cudaFreeArray,  cudaMallocHost  (C  API),  cudaFreeHost,
           cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuArrayCreate
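
        A sketch allocating a 2D array of floats and filling it from a tightly packed host image (h_img and the
        dimensions are hypothetical):

            void uploadImage(const float *h_img)      /* hypothetical 512x512 host image */
            {
                cudaArray_t arr;
                struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
                size_t w = 512, h = 512;              /* in elements */

                cudaMallocArray(&arr, &desc, w, h);
                cudaMemcpy2DToArray(arr, 0, 0, h_img, w * sizeof(float),
                                    w * sizeof(float), h, cudaMemcpyHostToDevice);
                /* ... bind the array to a texture or surface and use it ... */
                cudaFreeArray(arr);
            }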

   cudaError_t cudaMallocHost (void ** ptr, size_t size)
       Allocates size bytes of host memory that is page-locked and accessible to the device. The  driver  tracks
       the  virtual  memory ranges allocated with this function and automatically accelerates calls to functions
       such as cudaMemcpy*(). Since the memory can be accessed directly by the device, it can be read or written
       with much higher bandwidth than pageable memory obtained with  functions  such  as  malloc().  Allocating
       excessive  amounts  of  memory with cudaMallocHost() may degrade system performance, since it reduces the
       amount of memory available to the system for paging. As a result, this function is best used sparingly to
       allocate staging areas for data exchange between host and device.

       Parameters:
           ptr - Pointer to allocated host memory
           size - Requested allocation size in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
            cudaMalloc, cudaMallocPitch, cudaMallocArray, cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc,
            cudaFree, cudaFreeArray, cudaMallocHost (C++ API), cudaFreeHost, cuMemAllocHost

   __cudart_builtin__ cudaError_t cudaMallocManaged (void ** devPtr, size_t size, unsigned int flags = 0x01)
       Allocates  size  bytes  of managed memory on the device and returns in *devPtr a pointer to the allocated
       memory. If the device doesn't support  allocating  managed  memory,  cudaErrorNotSupported  is  returned.
       Support  for  managed  memory  can  be  queried  using the device attribute cudaDevAttrManagedMemory. The
       allocated memory is suitably aligned for any kind of variable. The memory is not cleared. If size  is  0,
       cudaMallocManaged  returns  cudaErrorInvalidValue. The pointer is valid on the CPU and on all GPUs in the
       system that support managed memory. All accesses to this pointer must obey the Unified Memory programming
       model.

       flags  specifies  the  default  stream  association  for  this  allocation.  flags   must   be   one   of
       cudaMemAttachGlobal  or  cudaMemAttachHost.  The  default  value  for  flags  is  cudaMemAttachGlobal. If
       cudaMemAttachGlobal is specified, then this memory is accessible  from  any  stream  on  any  device.  If
       cudaMemAttachHost  is specified, then the allocation should not be accessed from devices that have a zero
       value   for   the   device   attribute   cudaDevAttrConcurrentManagedAccess;   an   explicit   call    to
       cudaStreamAttachMemAsync will be required to enable access on such devices.

       If  the  association  is  later  changed  via  cudaStreamAttachMemAsync  to  a single stream, the default
       association, as specified during cudaMallocManaged, is  restored  when  that  stream  is  destroyed.  For
       __managed__  variables,  the  default  association  is always cudaMemAttachGlobal. Note that destroying a
       stream is an asynchronous operation, and as a result, the change  to  default  association  won't  happen
       until all work in the stream has completed.

       Memory allocated with cudaMallocManaged should be released with cudaFree.

       Device  memory  oversubscription is possible for GPUs that have a non-zero value for the device attribute
       cudaDevAttrConcurrentManagedAccess. Managed memory on such GPUs may be evicted from device memory to host
       memory at any time by the Unified Memory driver in order to make room for other allocations.

       In  a  multi-GPU  system  where  all  GPUs   have   a   non-zero   value   for   the   device   attribute
       cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this API returns and instead
       may  be populated on access. In such systems, managed memory can migrate to any processor's memory at any
       time. The Unified Memory driver will employ heuristics to maintain data locality  and  prevent  excessive
       page faults to the extent possible. The application can also guide the driver about memory usage patterns
       via cudaMemAdvise. The application can also explicitly migrate memory to a desired processor's memory via
       cudaMemPrefetchAsync.

       In   a   multi-GPU   system  where  all  of  the  GPUs  have  a  zero  value  for  the  device  attribute
       cudaDevAttrConcurrentManagedAccess and all the GPUs  have  peer-to-peer  support  with  each  other,  the
       physical  storage  for managed memory is created on the GPU which is active at the time cudaMallocManaged
       is called. All other GPUs will reference the data at reduced bandwidth via peer mappings  over  the  PCIe
       bus. The Unified Memory driver does not migrate memory among such GPUs.

       In a multi-GPU system where not all GPUs have peer-to-peer support with each other and where the value of
       the  device  attribute  cudaDevAttrConcurrentManagedAccess  is  zero  for at least one of those GPUs, the
       location chosen for physical storage of managed memory is system-dependent.

       • On Linux, the location chosen will be device memory as long as the current set of active  contexts  are
         on  devices  that  either  have  peer-to-peer  support with each other or have a non-zero value for the
         device attribute cudaDevAttrConcurrentManagedAccess. If there is an active context on a GPU  that  does
         not  have a non-zero value for that device attribute and it does not have peer-to-peer support with the
         other devices that have active contexts on them, then the location for physical storage will be  'zero-
         copy'  or  host  memory.  Note  that this means that managed memory that is located in device memory is
         migrated to host memory if a new context is created on a GPU that doesn't have a non-zero value for the
         device attribute and does not support peer-to-peer with at least one of the other devices that  has  an
         active  context.  This  in  turn  implies  that context creation may fail if there is insufficient host
         memory to migrate all managed allocations.

       • On Windows, the physical storage is always created  in  'zero-copy'  or  host  memory.  All  GPUs  will
         reference  the  data  at  reduced  bandwidth  over  the  PCIe  bus.  In these circumstances, use of the
         environment variable CUDA_VISIBLE_DEVICES is recommended to restrict CUDA to only use those  GPUs  that
         have  peer-to-peer support. Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-
         zero value to force the driver to always use device memory for physical storage. When this  environment
         variable  is set to a non-zero value, all devices used in that process that support managed memory have
         to be peer-to-peer compatible with each other. The error cudaErrorInvalidDevice will be returned  if  a
         device that supports managed memory is used and it is not peer-to-peer compatible with any of the other
         managed  memory  supporting  devices that were previously used in that process, even if cudaDeviceReset
         has been called on those devices. These environment variables are described  in  the  CUDA  programming
         guide under the 'CUDA environment variables' section.

       Parameters:
           devPtr - Pointer to allocated device memory
           size - Requested allocation size in bytes
           flags - Must be either cudaMemAttachGlobal or cudaMemAttachHost (defaults to cudaMemAttachGlobal)

       Returns:
           cudaSuccess, cudaErrorMemoryAllocation, cudaErrorNotSupported, cudaErrorInvalidValue

       See also:
           cudaMallocPitch,   cudaFree,   cudaMallocArray,   cudaFreeArray,   cudaMalloc3D,   cudaMalloc3DArray,
           cudaMallocHost      (C      API),      cudaFreeHost,      cudaHostAlloc,      cudaDeviceGetAttribute,
           cudaStreamAttachMemAsync, cuMemAllocManaged

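        A minimal sketch of a managed allocation; the pointer is dereferenced directly from host code, and the
        kernel launch shown is hypothetical:

            void managedExample(void)
            {
                int *data;
                size_t n = 1 << 20;

                cudaMallocManaged((void **)&data, n * sizeof(int)); /* cudaMemAttachGlobal */
                for (size_t i = 0; i < n; i++)        /* direct CPU access is valid */
                    data[i] = (int)i;
                /* scale<<<grid, block>>>(data, n);      hypothetical kernel */
                cudaDeviceSynchronize();              /* before the CPU touches data again */
                cudaFree(data);
            }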

   cudaError_t     cudaMallocMipmappedArray     (cudaMipmappedArray_t    *    mipmappedArray,    const    struct
       cudaChannelFormatDesc * desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags = 0)
       Allocates a CUDA mipmapped array according to the cudaChannelFormatDesc  structure  desc  and  returns  a
       handle  to  the  new  CUDA  mipmapped  array in *mipmappedArray. numLevels specifies the number of mipmap
       levels to be allocated. This value is  clamped  to  the  range  [1,  1  +  floor(log2(max(width,  height,
       depth)))].

       The cudaChannelFormatDesc is defined as:

           struct cudaChannelFormatDesc {
               int x, y, z, w;
               enum cudaChannelFormatKind f;
           };

        where cudaChannelFormatKind is one of cudaChannelFormatKindSigned, cudaChannelFormatKindUnsigned, or
        cudaChannelFormatKindFloat.

       cudaMallocMipmappedArray() can allocate the following:

       • A 1D mipmapped array is allocated if the height and depth extents are both zero.

       • A 2D mipmapped array is allocated if only the depth extent is zero.

       • A 3D mipmapped array is allocated if all three extents are non-zero.

       • A 1D  layered  CUDA  mipmapped  array  is  allocated  if  only  the  height  extent  is  zero  and  the
         cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is determined by
         the depth extent.

       • A  2D  layered  CUDA  mipmapped  array  is  allocated  if  all  three  extents  are  non-zero  and  the
         cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is determined by
         the depth extent.

       • A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the  cudaArrayCubemap
         flag  is  set.  Width  must  be  equal to height, and depth must be six. The order of the six layers in
         memory is the same as that listed in cudaGraphicsCubeFace.

       • A cubemap layered CUDA mipmapped array is allocated if  all  three  extents  are  non-zero,  and  both,
         cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be a
         multiple  of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
         array that consists of a collection of cubemap mipmapped arrays. The first  six  layers  represent  the
         first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.

       The flags parameter enables different options to be specified that affect the allocation, as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation

       • cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number
         of layers

       • cudaArrayCubemap:  Allocates  a  cubemap CUDA mipmapped array. Width must be equal to height, and depth
         must be six. If the cudaArrayLayered flag is also set, depth must be a multiple of six.

       • cudaArraySurfaceLoadStore: This flag indicates that individual mipmap  levels  of  the  CUDA  mipmapped
         array will be read from or written to using a surface reference.

       • cudaArrayTextureGather:  This  flag  indicates  that texture gather operations will be performed on the
         CUDA array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations
         are performed only on the most detailed mipmap level.

        The width, height and depth extents must meet per-device size limits (those reported in cudaDeviceProp).
        All values are specified in elements.

       Parameters:
           mipmappedArray - Pointer to allocated mipmapped array in device memory
           desc - Requested channel format
           extent - Requested allocation size (width field in elements)
           numLevels - Number of mipmap levels to allocate
           flags - Flags for extensions

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc3D,  cudaMalloc,  cudaMallocPitch,  cudaFree,  cudaFreeArray,   cudaMallocHost   (C   API),
           cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuMipmappedArrayCreate

   cudaError_t cudaMallocPitch (void ** devPtr, size_t * pitch, size_t width, size_t height)
       Allocates  at least width (in bytes) * height bytes of linear memory on the device and returns in *devPtr
       a pointer to the allocated memory. The function may pad  the  allocation  to  ensure  that  corresponding
       pointers  in any given row will continue to meet the alignment requirements for coalescing as the address
       is updated from row to row. The pitch returned in *pitch by cudaMallocPitch() is the width  in  bytes  of
       the allocation. The intended usage of pitch is as a separate parameter of the allocation, used to compute
       addresses  within  the  2D  array. Given the row and column of an array element of type T, the address is
       computed as:

           T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

       For allocations of 2D arrays, it is recommended that programmers consider  performing  pitch  allocations
       using  cudaMallocPitch(). Due to pitch alignment restrictions in the hardware, this is especially true if
       the application will be performing 2D memory copies between different regions of device  memory  (whether
       linear memory or CUDA arrays).

       Parameters:
           devPtr - Pointer to allocated pitched device memory
           pitch - Pitch for allocation
           width - Requested pitched allocation width (in bytes)
           height - Requested pitched allocation height

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMalloc,   cudaFree,   cudaMallocArray,   cudaFreeArray,  cudaMallocHost  (C  API),  cudaFreeHost,
           cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemAllocPitch
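
        A sketch pairing the allocation with the addressing formula above (dimensions are illustrative):

            void pitched2D(void)
            {
                float *devPtr;
                size_t pitch;
                size_t width = 640, height = 480;     /* elements per row, number of rows */

                cudaMallocPitch((void **)&devPtr, &pitch, width * sizeof(float), height);
                /* element (row, col), per the formula above:
                   float *p = (float *)((char *)devPtr + row * pitch) + col;      */
                cudaMemset2D(devPtr, pitch, 0, width * sizeof(float), height);
                cudaFree(devPtr);
            }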

   cudaError_t cudaMemAdvise (const void * devPtr, size_t count, enum cudaMemoryAdvise advice, int device)
       Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr  with
       a  size  of  count  bytes. The start address and end address of the memory range will be rounded down and
       rounded up respectively to be aligned to CPU page size before the advice is  applied.  The  memory  range
       must refer to managed memory allocated via cudaMallocManaged or declared via __managed__ variables.

       The advice parameter can take the following values:

       • cudaMemAdviseSetReadMostly:  This  implies  that  the  data  is  mostly  going to be read from and only
         occasionally written to. Any read accesses from any processor to this region will  create  a  read-only
         copy  of  at least the accessed pages in that processor's memory. Additionally, if cudaMemPrefetchAsync
         is called on this region, it will create a read-only copy of the data on the destination processor.  If
         any  processor  writes  to this region, all copies of the corresponding page will be invalidated except
         for the one where the write occurred. The device argument is ignored for this advice. Note that  for  a
         page to be read-duplicated, the accessing processor must either be the CPU or a GPU that has a non-zero
         value  for  the device attribute cudaDevAttrConcurrentManagedAccess. Also, if a context is created on a
         device that does not have the  device  attribute  cudaDevAttrConcurrentManagedAccess  set,  then  read-
         duplication will not occur until all such contexts are destroyed.

        • cudaMemAdviseUnsetReadMostly: Undoes the effect of cudaMemAdviseSetReadMostly and also prevents the
         Unified Memory driver from attempting  heuristic  read-duplication  on  the  memory  range.  Any  read-
         duplicated copies of the data will be collapsed into a single copy. The location for the collapsed copy
         will  be  the  preferred  location  if the page has a preferred location and one of the read-duplicated
         copies was resident at that location. Otherwise, the location chosen is arbitrary.

       • cudaMemAdviseSetPreferredLocation: This advice sets the preferred location  for  the  data  to  be  the
         memory  belonging  to device. Passing in cudaCpuDeviceId for device sets the preferred location as host
         memory.  If  device  is  a  GPU,  then  it  must  have  a  non-zero  value  for  the  device  attribute
         cudaDevAttrConcurrentManagedAccess.  Setting  the  preferred location does not cause data to migrate to
         that location immediately. Instead, it guides the migration policy when a fault occurs on  that  memory
         region.  If  the  data  is already in its preferred location and the faulting processor can establish a
         mapping without requiring the data to be migrated, then data migration will be avoided.  On  the  other
         hand,  if  the data is not in its preferred location or if a direct mapping cannot be established, then
         it will be migrated to the processor accessing it. It is important to note that setting  the  preferred
         location does not prevent data prefetching done using cudaMemPrefetchAsync. Having a preferred location
         can  override the page thrash detection and resolution logic in the Unified Memory driver. Normally, if
         a page is detected to be constantly thrashing between for example host and device memory, the page  may
         eventually  be pinned to host memory by the Unified Memory driver. But if the preferred location is set
         as device memory, then the page will continue to thrash indefinitely. If cudaMemAdviseSetReadMostly  is
         also  set on this memory region or any subset of it, then the policies associated with that advice will
         override the policies of this advice.

       • cudaMemAdviseUnsetPreferredLocation: Undoes the effect of cudaMemAdviseSetPreferredLocation and changes
         the preferred location to none.

       • cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by  device.  Passing  in
         cudaCpuDeviceId  for  device  will  set  the  advice  for  the CPU. If device is a GPU, then the device
         attribute cudaDevAttrConcurrentManagedAccess  must  be  non-zero.  This  advice  does  not  cause  data
         migration  and  has no impact on the location of the data per se. Instead, it causes the data to always
         be mapped in the specified processor's page tables, as long as the  location  of  the  data  permits  a
         mapping  to  be  established.  If  the  data  gets  migrated  for  any reason, the mappings are updated
         accordingly. This advice is recommended in scenarios where data locality is not important, but avoiding
         faults is. Consider for example a system containing multiple GPUs  with  peer-to-peer  access  enabled,
         where  the  data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating
         data over to the other GPUs is not as important because the accesses are infrequent and the overhead of
         migration may be too high. But preventing faults can still help improve performance, and  so  having  a
         mapping  set up in advance is useful. Note that on CPU access of this data, the data may be migrated to
         host memory because the CPU typically cannot access device  memory  directly.  Any  GPU  that  had  the
         cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
         page in host memory. If cudaMemAdviseSetReadMostly is also set on this memory region or any  subset  of
         it,  then  the  policies  associated  with  that  advice  will  override  the  policies of this advice.
         Additionally, if the preferred location of this memory region or any subset of it is also device,  then
         the  policies  associated  with  cudaMemAdviseSetPreferredLocation  will  override the policies of this
         advice.

       • cudaMemAdviseUnsetAccessedBy: Undoes the effect of cudaMemAdviseSetAccessedBy. Any mappings to the data
         from device may be removed at any time causing accesses to result in non-fatal page faults.

       Parameters:
           devPtr - Pointer to memory to set the advice for
           count - Size in bytes of the memory range
           advice - Advice to be applied for the specified memory range
           device - Device to apply the advice for

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync, cudaMemPrefetchAsync, cuMemAdvise
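
        For illustration only, a minimal sketch of applying and later removing read-duplication advice on a
        managed allocation, assuming device 0 reports a non-zero cudaDevAttrConcurrentManagedAccess (error
        checking omitted):

        float *buf;
        size_t bytes = 1 << 20;
        cudaMallocManaged((void **)&buf, bytes);
        // Reads from any processor may now create read-only copies of the accessed pages.
        cudaMemAdvise(buf, bytes, cudaMemAdviseSetReadMostly, 0);
        // ... launch kernels that mostly read buf ...
        // Collapse the duplicated copies back into a single copy.
        cudaMemAdvise(buf, bytes, cudaMemAdviseUnsetReadMostly, 0);
        cudaFree(buf);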

   cudaError_t cudaMemcpy (void * dst, const void * src, size_t count, enum cudaMemcpyKind kind)
       Copies count bytes from the memory area pointed to by src to the memory area pointed  to  by  dst,  where
       kind   specifies   the   direction   of   the   copy,   and   must   be   one   of  cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or  cudaMemcpyDefault.  Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the pointer values.
       However,  cudaMemcpyDefault  is  only allowed on systems that support unified virtual addressing. Calling
        cudaMemcpy() with dst and src pointers that do not match the direction of the copy results in undefined
        behavior.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy2D,  cudaMemcpyToArray,  cudaMemcpy2DToArray,  cudaMemcpyFromArray,  cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,     cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyDtoH, cuMemcpyHtoD, cuMemcpyDtoD, cuMemcpy
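
        A minimal round-trip sketch (error checking omitted):

        float h[256] = {0};
        float *d;
        cudaMalloc((void **)&d, sizeof(h));
        cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);   // host -> device
        // ... operate on d ...
        cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);   // device -> host
        cudaFree(d);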

   cudaError_t  cudaMemcpy2D  (void  * dst, size_t dpitch, const void * src, size_t spitch, size_t width, size_t
       height, enum cudaMemcpyKind kind)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to by src  to  the  memory
       area  pointed  to  by  dst,  where  kind  specifies  the  direction  of  the  copy,  and  must  be one of
       cudaMemcpyHostToHost,  cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,   or
       cudaMemcpyDefault.  Passing  cudaMemcpyDefault  is  recommended,  in  which  case the type of transfer is
       inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on  systems  that  support
       unified  virtual addressing. dpitch and spitch are the widths in memory in bytes of the 2D arrays pointed
       to by dst and src, including any padding added to the end of each row. The memory areas may not  overlap.
        width must not exceed either dpitch or spitch. Calling cudaMemcpy2D() with dst and src pointers that do
        not match the direction of the copy results in undefined behavior. cudaMemcpy2D() returns an error if
        dpitch or spitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMemcpy,   cudaMemcpyToArray,   cudaMemcpy2DToArray,  cudaMemcpyFromArray,  cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,     cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned
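
        A sketch of copying a densely packed 256 x 512 float host matrix into a pitched device allocation; the
        host rows are contiguous, so spitch is simply the row width in bytes, while dpitch is returned by
        cudaMallocPitch (error checking omitted):

        size_t width = 512 * sizeof(float);   // row width in bytes
        size_t height = 256;                  // number of rows
        size_t dpitch;
        float *hostM = (float *)malloc(width * height);   // requires <stdlib.h>
        float *devM;
        cudaMallocPitch((void **)&devM, &dpitch, width, height);
        cudaMemcpy2D(devM, dpitch, hostM, width, width, height, cudaMemcpyHostToDevice);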

   cudaError_t    cudaMemcpy2DArrayToArray    (cudaArray_t    dst,   size_t   wOffsetDst,   size_t   hOffsetDst,
       cudaArray_const_t  src,  size_t  wOffsetSrc,  size_t  hOffsetSrc,  size_t  width,  size_t  height,   enum
       cudaMemcpyKind kind = cudaMemcpyDeviceToDevice)
        Copies a matrix (height rows of width bytes each) from the CUDA array src starting at the upper left
       corner  (wOffsetSrc,  hOffsetSrc)  to  the  CUDA array dst starting at the upper left corner (wOffsetDst,
       hOffsetDst), where kind specifies the direction of the copy, and must  be  one  of  cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice,  cudaMemcpyDeviceToHost,  cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the pointer values.
       However, cudaMemcpyDefault is only allowed on systems that support unified virtual addressing. wOffsetDst
       + width must not exceed the width of the CUDA array dst. wOffsetSrc + width must not exceed the width  of
       the CUDA array src.

       Parameters:
           dst - Destination memory address
           wOffsetDst - Destination starting X offset
           hOffsetDst - Destination starting Y offset
           src - Source memory address
           wOffsetSrc - Source starting X offset
           hOffsetSrc - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,     cudaMemcpyArrayToArray,     cudaMemcpyToSymbol,      cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned

   __cudart_builtin__ cudaError_t cudaMemcpy2DAsync (void * dst, size_t dpitch, const void * src, size_t spitch,
       size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies  a  matrix  (height rows of width bytes each) from the memory area pointed to by src to the memory
       area pointed to  by  dst,  where  kind  specifies  the  direction  of  the  copy,  and  must  be  one  of
       cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing. dpitch and spitch are the widths in memory in bytes of the 2D arrays  pointed
       to  by dst and src, including any padding added to the end of each row. The memory areas may not overlap.
       width must not exceed either dpitch or spitch.

        Calling cudaMemcpy2DAsync() with dst and src pointers that do not match the direction of the copy results
        in undefined behavior. cudaMemcpy2DAsync() returns an error if dpitch or spitch is greater than the
        maximum allowed.

       cudaMemcpy2DAsync()  is  asynchronous with respect to the host, so the call may return before the copy is
       complete. The copy can optionally be associated to a stream by passing a  non-zero  stream  argument.  If
       kind  is  cudaMemcpyHostToDevice  or  cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap
       with operations in other streams.

       The device version of this function only handles device to device copies and cannot  be  given  local  or
       shared pointers.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,      cudaMemcpyAsync,     cudaMemcpyToArrayAsync,     cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync
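
        A sketch issuing the copy in a non-default stream; the host buffer is allocated with cudaMallocHost so
        that the copy can actually overlap with work in other streams (error checking omitted):

        cudaStream_t s;
        size_t width = 512 * sizeof(float), height = 256, dpitch;
        float *hPinned, *devM;
        cudaStreamCreate(&s);
        cudaMallocHost((void **)&hPinned, width * height);   // page-locked host rows
        cudaMallocPitch((void **)&devM, &dpitch, width, height);
        cudaMemcpy2DAsync(devM, dpitch, hPinned, width, width, height, cudaMemcpyHostToDevice, s);
        // ... enqueue kernels in s that consume devM ...
        cudaStreamSynchronize(s);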

   cudaError_t  cudaMemcpy2DFromArray  (void * dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t
       hOffset, size_t width, size_t height, enum cudaMemcpyKind kind)
        Copies a matrix (height rows of width bytes each) from the CUDA array src starting at the upper left
       corner (wOffset, hOffset) to the memory area pointed to by dst, where kind specifies the direction of the
       copy,  and  must  be  one  of   cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice,  or  cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case
       the type of transfer is inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on
       systems  that  support unified virtual addressing. dpitch is the width in memory in bytes of the 2D array
       pointed to by dst, including any padding added to the end of each row. wOffset + width  must  not  exceed
       the  width  of the CUDA array src. width must not exceed dpitch. cudaMemcpy2DFromArray() returns an error
       if dpitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpyArrayToArray,     cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned

   cudaError_t cudaMemcpy2DFromArrayAsync (void * dst, size_t dpitch,  cudaArray_const_t  src,  size_t  wOffset,
       size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
        Copies a matrix (height rows of width bytes each) from the CUDA array src starting at the upper left
       corner (wOffset, hOffset) to the memory area pointed to by dst, where kind specifies the direction of the
       copy,   and   must   be  one  of  cudaMemcpyHostToHost,  cudaMemcpyHostToDevice,  cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case
       the  type  of transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed on
       systems that support unified virtual addressing. dpitch is the width in memory in bytes of the  2D  array
       pointed  to  by  dst, including any padding added to the end of each row. wOffset + width must not exceed
       the width of the CUDA array src. width must not exceed dpitch.  cudaMemcpy2DFromArrayAsync()  returns  an
       error if dpitch exceeds the maximum allowed.

       cudaMemcpy2DFromArrayAsync()  is asynchronous with respect to the host, so the call may return before the
       copy is complete. The copy can optionally be  associated  to  a  stream  by  passing  a  non-zero  stream
       argument.  If  kind  is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and stream is non-zero, the copy
       may overlap with operations in other streams.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,       cudaMemcpyAsync,        cudaMemcpy2DAsync,        cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync,              cudaMemcpyFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpy2DToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void  *  src,  size_t
       spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
       Copies  a  matrix  (height  rows  of width bytes each) from the memory area pointed to by src to the CUDA
        array dst starting at the upper left corner (wOffset, hOffset), where kind specifies the direction of the
       copy,   and   must   be  one  of  cudaMemcpyHostToHost,  cudaMemcpyHostToDevice,  cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case
       the  type  of transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed on
       systems that support unified virtual addressing. spitch is the width in memory in bytes of the  2D  array
       pointed  to  by  src, including any padding added to the end of each row. wOffset + width must not exceed
       the width of the CUDA array dst. width must not exceed spitch. cudaMemcpy2DToArray() returns an error  if
       spitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,     cudaMemcpy2D,    cudaMemcpyToArray,    cudaMemcpyFromArray,    cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,     cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned

   cudaError_t  cudaMemcpy2DToArrayAsync  (cudaArray_t  dst,  size_t  wOffset, size_t hOffset, const void * src,
       size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to  by  src  to  the  CUDA
        array dst starting at the upper left corner (wOffset, hOffset), where kind specifies the direction of the
       copy,  and  must  be  one  of   cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice,  or  cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case
       the type of transfer is inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on
       systems  that  support unified virtual addressing. spitch is the width in memory in bytes of the 2D array
       pointed to by src, including any padding added to the end of each row. wOffset + width  must  not  exceed
       the  width  of  the  CUDA  array dst. width must not exceed spitch. cudaMemcpy2DToArrayAsync() returns an
       error if spitch exceeds the maximum allowed.

       cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so the call may  return  before  the
       copy  is  complete.  The  copy  can  optionally  be  associated  to a stream by passing a non-zero stream
       argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and stream is  non-zero,  the  copy
       may overlap with operations in other streams.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,        cudaMemcpyAsync,        cudaMemcpy2DAsync,       cudaMemcpyToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpy3D (const struct cudaMemcpy3DParms * p)
       struct cudaExtent {
         size_t width;
         size_t height;
         size_t depth;
       };
       struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

       struct cudaPos {
         size_t x;
         size_t y;
         size_t z;
       };
       struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

       struct cudaMemcpy3DParms {
         cudaArray_t           srcArray;
         struct cudaPos        srcPos;
         struct cudaPitchedPtr srcPtr;
         cudaArray_t           dstArray;
         struct cudaPos        dstPos;
         struct cudaPitchedPtr dstPtr;
         struct cudaExtent     extent;
         enum cudaMemcpyKind   kind;
       };

        cudaMemcpy3D() copies data between two 3D objects. The source and destination objects may be in either
        host memory, device memory, or a CUDA array. The source, destination, extent, and kind of the copy
        performed are specified by the cudaMemcpy3DParms struct, which should be initialized to zero before use:

       cudaMemcpy3DParms myParms = {0};

       The struct passed to cudaMemcpy3D() must specify one of srcArray or srcPtr and one of dstArray or dstPtr.
       Passing more than one non-zero source or destination will cause cudaMemcpy3D() to return an error.

       The srcPos and dstPos fields are optional offsets into the source and destination objects and are defined
       in  units  of  each object's elements. The element for a host or device pointer is assumed to be unsigned
       char.

       The extent field defines the dimensions of  the  transferred  area  in  elements.  If  a  CUDA  array  is
       participating  in  the copy, the extent is defined in terms of that array's elements. If no CUDA array is
       participating in the copy then the extents are defined in elements of unsigned char.

       The  kind  field  defines  the  direction  of  the  copy.  It  must  be  one   of   cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice,  cudaMemcpyDeviceToHost,  cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the pointer values.
        However, cudaMemcpyDefault is only allowed on systems that support unified virtual addressing. If kind
        is cudaMemcpyHostToHost, cudaMemcpyHostToDevice, or cudaMemcpyDeviceToHost and a cudaArray is passed as
        the source or destination, then where that kind implies the cudaArray resides on the host, cudaMemcpy3D()
        will disregard the implication and silently correct the kind, since a cudaArray can only reside on the
        device.

       If the source and destination are both arrays, cudaMemcpy3D() will return an error if they  do  not  have
       the same element size.

        The source and destination objects may not overlap. If overlapping source and destination objects are
        specified, undefined behavior will result.

       The source object must lie entirely within the region defined  by  srcPos  and  extent.  The  destination
       object must lie entirely within the region defined by dstPos and extent.

       cudaMemcpy3D()  returns  an error if the pitch of srcPtr or dstPtr exceeds the maximum allowed. The pitch
       of a cudaPitchedPtr allocated with cudaMalloc3D() will always be valid.

       Parameters:
           p - 3D memory copy parameters

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMalloc3D,   cudaMalloc3DArray,   cudaMemset3D,   cudaMemcpy3DAsync,   cudaMemcpy,   cudaMemcpy2D,
           cudaMemcpyToArray,       cudaMemcpy2DToArray,       cudaMemcpyFromArray,       cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,     cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, make_cudaExtent, make_cudaPos, cuMemcpy3D
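
        A sketch of a host-to-array copy; host3D and arr3D are hypothetical names for a packed 64 x 32 x 8 float
        host buffer and a cudaArray_t of the same extent obtained from cudaMalloc3DArray (error checking
        omitted):

        cudaMemcpy3DParms p = {0};
        p.srcPtr = make_cudaPitchedPtr(host3D, 64 * sizeof(float), 64, 32);
        p.dstArray = arr3D;
        p.extent = make_cudaExtent(64, 32, 8);   // in elements of the participating array
        p.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3D(&p);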

   __cudart_builtin__  cudaError_t  cudaMemcpy3DAsync (const struct cudaMemcpy3DParms * p, cudaStream_t stream =
       0)
       struct cudaExtent {
         size_t width;
         size_t height;
         size_t depth;
       };
       struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

       struct cudaPos {
         size_t x;
         size_t y;
         size_t z;
       };
       struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

       struct cudaMemcpy3DParms {
         cudaArray_t           srcArray;
         struct cudaPos        srcPos;
         struct cudaPitchedPtr srcPtr;
         cudaArray_t           dstArray;
         struct cudaPos        dstPos;
         struct cudaPitchedPtr dstPtr;
         struct cudaExtent     extent;
         enum cudaMemcpyKind   kind;
       };

        cudaMemcpy3DAsync() copies data between two 3D objects. The source and destination objects may be in
        either host memory, device memory, or a CUDA array. The source, destination, extent, and kind of the
        copy performed are specified by the cudaMemcpy3DParms struct, which should be initialized to zero before
        use:

       cudaMemcpy3DParms myParms = {0};

       The struct passed to cudaMemcpy3DAsync() must specify one of srcArray or srcPtr and one  of  dstArray  or
       dstPtr.  Passing more than one non-zero source or destination will cause cudaMemcpy3DAsync() to return an
       error.

       The srcPos and dstPos fields are optional offsets into the source and destination objects and are defined
       in units of each object's elements. The element for a host or device pointer is assumed  to  be  unsigned
       char. For CUDA arrays, positions must be in the range [0, 2048) for any dimension.

       The  extent  field  defines  the  dimensions  of  the  transferred  area  in elements. If a CUDA array is
       participating in the copy, the extent is defined in terms of that array's elements. If no CUDA  array  is
       participating in the copy then the extents are defined in elements of unsigned char.

       The   kind   field  defines  the  direction  of  the  copy.  It  must  be  one  of  cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or  cudaMemcpyDefault.  Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the pointer values.
        However, cudaMemcpyDefault is only allowed on systems that support unified virtual addressing. If kind
        is cudaMemcpyHostToHost, cudaMemcpyHostToDevice, or cudaMemcpyDeviceToHost and a cudaArray is passed as
        the source or destination, then where that kind implies the cudaArray resides on the host,
        cudaMemcpy3DAsync() will disregard the implication and silently correct the kind, since a cudaArray can
        only reside on the device.

       If  the  source  and destination are both arrays, cudaMemcpy3DAsync() will return an error if they do not
       have the same element size.

        The source and destination objects may not overlap. If overlapping source and destination objects are
        specified, undefined behavior will result.

       The  source  object  must  lie  entirely  within the region defined by srcPos and extent. The destination
       object must lie entirely within the region defined by dstPos and extent.

       cudaMemcpy3DAsync() returns an error if the pitch of srcPtr or dstPtr exceeds the  maximum  allowed.  The
       pitch of a cudaPitchedPtr allocated with cudaMalloc3D() will always be valid.

       cudaMemcpy3DAsync()  is  asynchronous with respect to the host, so the call may return before the copy is
       complete. The copy can optionally be associated to a stream by passing a  non-zero  stream  argument.  If
       kind  is  cudaMemcpyHostToDevice  or  cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap
       with operations in other streams.

       The device version of this function only handles device to device copies and cannot  be  given  local  or
       shared pointers.

       Parameters:
           p - 3D memory copy parameters
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMalloc3D,    cudaMalloc3DArray,    cudaMemset3D,    cudaMemcpy3D,    cudaMemcpy,    cudaMemcpy2D,
           cudaMemcpyToArray,       cudaMemcpy2DToArray,       cudaMemcpyFromArray,       cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,     cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, make_cudaExtent, make_cudaPos, cuMemcpy3DAsync

   cudaError_t cudaMemcpy3DPeer (const struct cudaMemcpy3DPeerParms * p)
       Perform a 3D memory copy according  to  the  parameters  specified  in  p.  See  the  definition  of  the
       cudaMemcpy3DPeerParms structure for documentation of its parameters.

        Note that this function is synchronous with respect to the host only if the source or destination of the
        transfer is host memory. Note also that this copy is serialized with respect to all pending and future
        asynchronous work in the current device, the copy's source device, and the copy's destination device
        (use cudaMemcpy3DPeerAsync to avoid this synchronization).

       Parameters:
           p - Parameters for the memory copy

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,    cudaMemcpyPeer,    cudaMemcpyAsync,    cudaMemcpyPeerAsync,     cudaMemcpy3DPeerAsync,
           cuMemcpy3DPeer

   cudaError_t cudaMemcpy3DPeerAsync (const struct cudaMemcpy3DPeerParms * p, cudaStream_t stream = 0)
       Perform  a  3D  memory  copy  according  to  the  parameters  specified  in  p. See the definition of the
       cudaMemcpy3DPeerParms structure for documentation of its parameters.

       Parameters:
           p - Parameters for the memory copy
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,    cudaMemcpyPeer,    cudaMemcpyAsync,    cudaMemcpyPeerAsync,     cudaMemcpy3DPeerAsync,
           cuMemcpy3DPeerAsync

   cudaError_t  cudaMemcpyArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t
       src,   size_t   wOffsetSrc,   size_t   hOffsetSrc,   size_t   count,   enum   cudaMemcpyKind    kind    =
       cudaMemcpyDeviceToDevice)
       Copies  count bytes from the CUDA array src starting at the upper left corner (wOffsetSrc, hOffsetSrc) to
        the CUDA array dst starting at the upper left corner (wOffsetDst, hOffsetDst), where kind specifies the
       direction   of   the   copy,   and   must   be   one   of  cudaMemcpyHostToHost,  cudaMemcpyHostToDevice,
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice,  or  cudaMemcpyDefault.  Passing  cudaMemcpyDefault  is
       recommended,  in  which  case  the  type  of  transfer  is  inferred  from  the  pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.

       Parameters:
           dst - Destination memory address
           wOffsetDst - Destination starting X offset
           hOffsetDst - Destination starting Y offset
           src - Source memory address
           wOffsetSrc - Source starting X offset
           hOffsetSrc - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,     cudaMemcpy2DArrayToArray,     cudaMemcpyToSymbol,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyAtoA

   __cudart_builtin__  cudaError_t  cudaMemcpyAsync  (void  *  dst,  const  void  *  src,  size_t  count,   enum
       cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies  count  bytes  from  the memory area pointed to by src to the memory area pointed to by dst, where
       kind  specifies   the   direction   of   the   copy,   and   must   be   one   of   cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice,  cudaMemcpyDeviceToHost,  cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the pointer values.
       However, cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.

        The memory areas may not overlap. Calling cudaMemcpyAsync() with dst and src pointers that do not match
        the direction of the copy results in undefined behavior.

       cudaMemcpyAsync()  is  asynchronous  with  respect to the host, so the call may return before the copy is
       complete. The copy can optionally be associated to a stream by passing a  non-zero  stream  argument.  If
       kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and the stream is non-zero, the copy may overlap
       with operations in other streams.

       The  device  version  of  this function only handles device to device copies and cannot be given local or
       shared pointers.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,    cudaMemcpy2DAsync,     cudaMemcpyToArrayAsync,     cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
            cudaMemcpyFromSymbolAsync, cuMemcpyAsync, cuMemcpyDtoHAsync, cuMemcpyHtoDAsync, cuMemcpyDtoDAsync
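
        A sketch that overlaps a host-to-device copy with kernel work in the same stream; the host buffer is
        page-locked via cudaMallocHost so the copy can be truly asynchronous (myKernel is illustrative; error
        checking omitted):

        cudaStream_t s;
        size_t bytes = 1 << 20;
        float *hPinned, *d;
        cudaStreamCreate(&s);
        cudaMallocHost((void **)&hPinned, bytes);
        cudaMalloc((void **)&d, bytes);
        cudaMemcpyAsync(d, hPinned, bytes, cudaMemcpyHostToDevice, s);
        // myKernel<<<blocks, threads, 0, s>>>(d);   // ordered after the copy in stream s
        cudaStreamSynchronize(s);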

   cudaError_t cudaMemcpyFromArray (void * dst, cudaArray_const_t src, size_t wOffset,  size_t  hOffset,  size_t
       count, enum cudaMemcpyKind kind)
       Copies  count  bytes  from the CUDA array src starting at the upper left corner (wOffset, hOffset) to the
       memory area pointed to by dst, where kind specifies the direction  of  the  copy,  and  must  be  one  of
       cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,    cudaMemcpy2D,    cudaMemcpyToArray,    cudaMemcpy2DToArray,     cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,     cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyAtoH, cuMemcpyAtoD

   cudaError_t cudaMemcpyFromArrayAsync (void * dst, cudaArray_const_t  src,  size_t  wOffset,  size_t  hOffset,
       size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies  count  bytes  from the CUDA array src starting at the upper left corner (wOffset, hOffset) to the
       memory area pointed to by dst, where kind specifies the direction  of  the  copy,  and  must  be  one  of
       cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing.

       cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so the call may  return  before  the
       copy  is  complete.  The  copy  can  optionally  be  associated  to a stream by passing a non-zero stream
       argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and stream is  non-zero,  the  copy
       may overlap with operations in other streams.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,        cudaMemcpyAsync,        cudaMemcpy2DAsync,       cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyAtoHAsync, cuMemcpy2DAsync

   cudaError_t  cudaMemcpyFromSymbol  (void  *  dst,  const void * symbol, size_t count, size_t offset = 0, enum
       cudaMemcpyKind kind = cudaMemcpyDeviceToHost)
        Copies count bytes from the memory area that begins offset bytes from the start of the symbol symbol to
        the memory area pointed to by dst. The memory areas may not overlap. symbol is a variable that resides in
       global  or constant memory space. kind can be either cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing.

       Parameters:
           dst - Destination memory address
           symbol - Device symbol address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer

       Returns:
           cudaSuccess,    cudaErrorInvalidValue,    cudaErrorInvalidSymbol,    cudaErrorInvalidMemcpyDirection,
           cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

           Use  of  a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and removed in
           CUDA 5.0.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy, cuMemcpyDtoH, cuMemcpyDtoD
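
        A sketch reading back a device-scope variable; devResult is a hypothetical __device__ variable, and the
        symbol is passed directly rather than as a string:

        __device__ float devResult;
        // ... after a kernel has written devResult ...
        float hostResult;
        cudaMemcpyFromSymbol(&hostResult, devResult, sizeof(hostResult));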

   cudaError_t cudaMemcpyFromSymbolAsync (void * dst, const void * symbol, size_t  count,  size_t  offset,  enum
       cudaMemcpyKind kind, cudaStream_t stream = 0)
        Copies count bytes from the memory area that begins offset bytes from the start of the symbol symbol to
        the memory area pointed to by dst. The memory areas may not overlap. symbol is a variable that resides in
       global or constant memory space. kind can be either cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault.  Passing  cudaMemcpyDefault  is  recommended,  in  which  case the type of transfer is
       inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on  systems  that  support
       unified virtual addressing.

       cudaMemcpyFromSymbolAsync()  is  asynchronous with respect to the host, so the call may return before the
       copy is complete. The copy can optionally be  associated  to  a  stream  by  passing  a  non-zero  stream
       argument.  If kind is cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap with operations
       in other streams.

       Parameters:
           dst - Destination memory address
           symbol - Device symbol address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess,    cudaErrorInvalidValue,    cudaErrorInvalidSymbol,    cudaErrorInvalidMemcpyDirection,
           cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

           Use  of  a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and removed in
           CUDA 5.0.

       See also:
           cudaMemcpy,    cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,     cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,       cudaMemcpyAsync,        cudaMemcpy2DAsync,        cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync,            cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cuMemcpyAsync, cuMemcpyDtoHAsync, cuMemcpyDtoDAsync

   cudaError_t cudaMemcpyPeer (void * dst, int dstDevice, const void * src, int srcDevice, size_t count)
       Copies memory from one device to memory on another  device.  dst  is  the  base  device  pointer  of  the
       destination  memory and dstDevice is the destination device. src is the base device pointer of the source
       memory and srcDevice is the source device. count specifies the number of bytes to copy.

        Note that this function is asynchronous with respect to the host, but serialized with respect to all
        pending and future asynchronous work in the current device, srcDevice, and dstDevice (use
        cudaMemcpyPeerAsync to avoid this synchronization).

       Parameters:
           dst - Destination device pointer
           dstDevice - Destination device
           src - Source device pointer
           srcDevice - Source device
           count - Size of memory copy in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpyAsync, cudaMemcpyPeerAsync, cudaMemcpy3DPeerAsync, cuMemcpyPeer
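
        A sketch copying a buffer from device 0 to device 1 (error checking omitted; the copy works whether or
        not peer access has been enabled, typically staging through the host when it has not):

        size_t bytes = 1 << 20;
        float *d0, *d1;
        cudaSetDevice(0);
        cudaMalloc((void **)&d0, bytes);
        cudaSetDevice(1);
        cudaMalloc((void **)&d1, bytes);
        cudaMemcpyPeer(d1, 1, d0, 0, bytes);   // dst on device 1, src on device 0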

   cudaError_t  cudaMemcpyPeerAsync  (void  * dst, int dstDevice, const void * src, int srcDevice, size_t count,
       cudaStream_t stream = 0)
       Copies memory from one device to memory on another  device.  dst  is  the  base  device  pointer  of  the
       destination  memory and dstDevice is the destination device. src is the base device pointer of the source
       memory and srcDevice is the source device. count specifies the number of bytes to copy.

       Note that this function is asynchronous with respect to the host and all work on other devices.

       Parameters:
           dst - Destination device pointer
           dstDevice - Destination device
           src - Source device pointer
           srcDevice - Source device
           count - Size of memory copy in bytes
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync, cuMemcpyPeerAsync

   cudaError_t cudaMemcpyToArray (cudaArray_t dst, size_t wOffset, size_t hOffset,  const  void  *  src,  size_t
       count, enum cudaMemcpyKind kind)
       Copies  count  bytes  from  the memory area pointed to by src to the CUDA array dst starting at the upper
       left corner (wOffset, hOffset), where kind specifies the direction of  the  copy,  and  must  be  one  of
       cudaMemcpyHostToHost,   cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy,   cudaMemcpy2D,    cudaMemcpy2DToArray,    cudaMemcpyFromArray,    cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray,     cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,      cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,             cudaMemcpy2DFromArrayAsync,            cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyHtoA, cuMemcpyDtoA

   cudaError_t cudaMemcpyToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const void * src, size_t
       count, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the memory area pointed to by src to the CUDA array dst  starting  at  the  upper
       left  corner  (wOffset,  hOffset),  where  kind  specifies  the direction of the copy, and must be one of
       cudaMemcpyHostToHost,  cudaMemcpyHostToDevice,   cudaMemcpyDeviceToHost,   cudaMemcpyDeviceToDevice,   or
       cudaMemcpyDefault.  Passing  cudaMemcpyDefault  is  recommended,  in  which  case the type of transfer is
       inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on  systems  that  support
       unified virtual addressing.

       cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so the call may return before the copy
       is  complete. The copy can optionally be associated to a stream by passing a non-zero stream argument. If
       kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and stream is non-zero,  the  copy  may  overlap
       with operations in other streams.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,       cudaMemcpyAsync,       cudaMemcpy2DAsync,       cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyHtoAAsync, cuMemcpy2DAsync

   cudaError_t  cudaMemcpyToSymbol (const void * symbol, const void * src, size_t count, size_t offset = 0, enum
       cudaMemcpyKind kind = cudaMemcpyHostToDevice)
        Copies count bytes from the memory area pointed to by src to the memory area that begins offset bytes
        from the start of the symbol symbol. The memory areas may not overlap. symbol is a variable that resides in
       global or constant memory space. kind can be either cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice,  or
       cudaMemcpyDefault.  Passing  cudaMemcpyDefault  is  recommended,  in  which  case the type of transfer is
       inferred from the pointer values. However, cudaMemcpyDefault is only  allowed  on  systems  that  support
       unified virtual addressing.

       Parameters:
           symbol - Device symbol address
           src - Source memory address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer

       Returns:
           cudaSuccess,    cudaErrorInvalidValue,    cudaErrorInvalidSymbol,    cudaErrorInvalidMemcpyDirection,
           cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and  removed  in
           CUDA 5.0.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,   cudaMemcpyArrayToArray,   cudaMemcpy2DArrayToArray,    cudaMemcpyFromSymbol,
           cudaMemcpyAsync,       cudaMemcpy2DAsync,      cudaMemcpyToArrayAsync,      cudaMemcpy2DToArrayAsync,
           cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,             cudaMemcpyToSymbolAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpy, cuMemcpyHtoD, cuMemcpyDtoD
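
        A sketch initializing a hypothetical __constant__ array from the host; the symbol is passed directly
        rather than as a string:

        __constant__ float coeffs[16];
        // ...
        float hostCoeffs[16] = { 1.0f /* ... */ };
        cudaMemcpyToSymbol(coeffs, hostCoeffs, sizeof(hostCoeffs));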

   cudaError_t cudaMemcpyToSymbolAsync (const void * symbol, const void * src, size_t count, size_t offset, enum
       cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies  count  bytes from the memory area pointed to by src to the memory area pointed to by offset bytes
       from the start of symbol symbol. The memory areas may not overlap. symbol is a variable that  resides  in
       global  or constant memory space. kind can be either cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in  which  case  the  type  of  transfer  is
       inferred  from  the  pointer  values.  However, cudaMemcpyDefault is only allowed on systems that support
       unified virtual addressing.

       cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so the call  may  return  before  the
       copy  is  complete.  The  copy  can  optionally  be  associated  to a stream by passing a non-zero stream
       argument. If kind is cudaMemcpyHostToDevice and stream is non-zero, the copy may overlap with  operations
       in other streams.

       Parameters:
           symbol - Device symbol address
           src - Source memory address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess,    cudaErrorInvalidValue,    cudaErrorInvalidSymbol,    cudaErrorInvalidMemcpyDirection,
           cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1 and  removed  in
           CUDA 5.0.

       See also:
           cudaMemcpy,     cudaMemcpy2D,     cudaMemcpyToArray,     cudaMemcpy2DToArray,    cudaMemcpyFromArray,
           cudaMemcpy2DFromArray,    cudaMemcpyArrayToArray,    cudaMemcpy2DArrayToArray,    cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol,        cudaMemcpyAsync,        cudaMemcpy2DAsync,       cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync,            cudaMemcpyFromArrayAsync,            cudaMemcpy2DFromArrayAsync,
           cudaMemcpyFromSymbolAsync, cuMemcpyAsync, cuMemcpyHtoDAsync, cuMemcpyDtoDAsync
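
        As a sketch (the symbol coeffs, the helper, and the stream argument are hypothetical), an asynchronous
        upload that can overlap with work in other streams when the source is pinned host memory:

            __constant__ float coeffs[16];

            void upload_coeffs_async(const float *pinned_src, cudaStream_t stream)
            {
                /* pinned_src should come from cudaHostAlloc or cudaHostRegister;
                   a pageable source is staged and may not overlap. */
                cudaMemcpyToSymbolAsync(coeffs, pinned_src, 16 * sizeof(float),
                                        0, cudaMemcpyHostToDevice, stream);
            }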

   cudaError_t cudaMemGetInfo (size_t * free, size_t * total)
        Returns in *free and *total, respectively, the free and total amount of memory available for allocation
        by the device, in bytes.

       Parameters:
           free - Returned free memory in bytes
           total - Returned total memory in bytes

       Returns:
           cudaSuccess, cudaErrorInitializationError, cudaErrorInvalidValue, cudaErrorLaunchFailure

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
           cuMemGetInfo
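
        A minimal usage sketch:

            #include <stdio.h>
            #include <cuda_runtime.h>

            void print_mem_info(void)
            {
                size_t free_bytes = 0, total_bytes = 0;
                if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess)
                    printf("free: %zu MiB of %zu MiB\n",
                           free_bytes >> 20, total_bytes >> 20);
            }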

   cudaError_t cudaMemPrefetchAsync (const void * devPtr, size_t count, int dstDevice, cudaStream_t stream = 0)
       Prefetches memory to the specified destination device. devPtr is the base device pointer of the memory to
       be  prefetched  and  dstDevice  is  the  destination device. count specifies the number of bytes to copy.
       stream is the stream in which the operation is enqueued. The memory range must refer  to  managed  memory
       allocated via cudaMallocManaged or declared via __managed__ variables.

       Passing  in  cudaCpuDeviceId  for dstDevice will prefetch the data to host memory. If dstDevice is a GPU,
       then the device attribute cudaDevAttrConcurrentManagedAccess must be non-zero. Additionally, stream  must
       be    associated   with   a   device   that   has   a   non-zero   value   for   the   device   attribute
       cudaDevAttrConcurrentManagedAccess.

       The start address and end address of the memory range will be rounded down and rounded up respectively to
       be aligned to CPU page size before the prefetch operation is enqueued in the stream.

       If no physical memory has been allocated for this region, then this memory region will be  populated  and
       mapped  on  the  destination  device.  If there's insufficient memory to prefetch the desired region, the
       Unified Memory driver may evict pages from other cudaMallocManaged allocations to host memory in order to
       make room. Device memory allocated using cudaMalloc or cudaMallocArray will not be evicted.

       By default, any mappings to the previous location of the migrated pages are removed and mappings for  the
       new location are only setup on dstDevice. The exact behavior however also depends on the settings applied
       to this memory range via cudaMemAdvise as described below:

       If  cudaMemAdviseSetReadMostly was set on any subset of this memory range, then that subset will create a
       read-only copy of the pages on dstDevice.

       If cudaMemAdviseSetPreferredLocation was called on any subset of this memory range, then the  pages  will
       be  migrated  to  dstDevice  even  if  dstDevice is not the preferred location of any pages in the memory
       range.

       If cudaMemAdviseSetAccessedBy was called on any subset of this memory range, then mappings to those pages
       from all the appropriate processors are updated to refer to the  new  location  if  establishing  such  a
       mapping is possible. Otherwise, those mappings are cleared.

       Note  that  this API is not required for functionality and only serves to improve performance by allowing
       the application to migrate data to a suitable location before it is accessed.  Memory  accesses  to  this
       range are always coherent and are allowed even when the data is actively being migrated.

       Note that this function is asynchronous with respect to the host and all work on other devices.

       Parameters:
           devPtr - Pointer to be prefetched
           count - Size in bytes
           dstDevice - Destination device to prefetch to
           stream - Stream to enqueue prefetch operation

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits asynchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync, cudaMemAdvise, cuMemPrefetchAsync
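
        A sketch of a typical round trip (the size n and the stream are hypothetical; device 0 is assumed to
        report a non-zero cudaDevAttrConcurrentManagedAccess):

            float *data = NULL;
            cudaMallocManaged((void **)&data, n * sizeof(float));

            /* Migrate the range to device 0 before kernels on `stream` touch it. */
            cudaMemPrefetchAsync(data, n * sizeof(float), 0, stream);
            /* ... launch kernels that read or write `data` on `stream` ... */

            /* Bring the pages back to host memory before CPU access. */
            cudaMemPrefetchAsync(data, n * sizeof(float), cudaCpuDeviceId, stream);
            cudaStreamSynchronize(stream);
            cudaFree(data);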

   cudaError_t  cudaMemRangeGetAttribute  (void  *  data, size_t dataSize, enum cudaMemRangeAttribute attribute,
       const void * devPtr, size_t count)
       Query an attribute about the memory range starting at devPtr with a size of count bytes. The memory range
       must refer to managed memory allocated via cudaMallocManaged or declared via __managed__ variables.

       The attribute parameter can take the following values:

       • cudaMemRangeAttributeReadMostly: If this attribute is specified, data will be interpreted as  a  32-bit
         integer,  and  dataSize must be 4. The result returned will be 1 if all pages in the given memory range
         have read-duplication enabled, or 0 otherwise.

       • cudaMemRangeAttributePreferredLocation: If this attribute is specified, data will be interpreted  as  a
         32-bit integer, and dataSize must be 4. The result returned will be a GPU device id if all pages in the
         memory  range  have that GPU as their preferred location, or it will be cudaCpuDeviceId if all pages in
         the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId if  either
         all  the  pages  don't  have  the  same  preferred location or some of the pages don't have a preferred
         location at all. Note that the actual location of the pages in the memory range  at  the  time  of  the
         query may be different from the preferred location.

        • cudaMemRangeAttributeAccessedBy: If this attribute is specified, data will be interpreted as an array
          of 32-bit integers, and dataSize must be a non-zero multiple of 4. The result returned will be a list
          of device ids that had cudaMemAdviseSetAccessedBy set for that entire memory range. If any device does
          not have that advice set for the entire memory range, that device will not be included. If data is
          larger than the number of devices that have that advice set for that memory range, cudaInvalidDeviceId
          will be returned in all the extra space provided. For example, if dataSize is 12 (i.e. data has 3
          elements) and only device 0 has the advice set, then the result returned will be { 0,
          cudaInvalidDeviceId, cudaInvalidDeviceId }. If data is smaller than the number of devices that have
          that advice set, then only as many devices will be returned as can fit in the array. There is no
          guarantee on which specific devices will be returned, however.

       • cudaMemRangeAttributeLastPrefetchLocation:  If this attribute is specified, data will be interpreted as
         a 32-bit integer, and dataSize must be 4. The result returned will be the last location  to  which  all
         pages  in  the  memory range were prefetched explicitly via cudaMemPrefetchAsync. This will either be a
         GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was  a  GPU  or  the  CPU
          respectively. If any page in the memory range was never explicitly prefetched, or if not all pages
          were prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply
          returns the last location that the application requested to prefetch the memory range to. It gives no
          indication as to whether the prefetch operation to that location has completed or even begun.

       Parameters:
            data - A pointer to a memory location where the result of the attribute query will be written
            dataSize - The size of data, in bytes
           attribute - The attribute to query
           devPtr - Start of the range to query
           count - Size of the range to query

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            This function exhibits synchronous behavior for most use cases.

            This function uses standard default stream semantics.

       See also:
           cudaMemRangeGetAttributes, cudaMemPrefetchAsync, cudaMemAdvise, cuMemRangeGetAttribute
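
        For example, a sketch querying whether read duplication is enabled on a managed range (devPtr and count
        as described above):

            int read_mostly = 0;
            /* dataSize must be 4 for cudaMemRangeAttributeReadMostly. */
            cudaMemRangeGetAttribute(&read_mostly, 4,
                                     cudaMemRangeAttributeReadMostly,
                                     devPtr, count);
            /* read_mostly is now 1 if every page in the range has
               read-duplication enabled, 0 otherwise. */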

   cudaError_t  cudaMemRangeGetAttributes  (void  **  data,  size_t  *  dataSizes,  enum cudaMemRangeAttribute *
       attributes, size_t numAttributes, const void * devPtr, size_t count)
       Query attributes of the memory range starting at devPtr with a size of count bytes. The memory range must
       refer to managed memory allocated via  cudaMallocManaged  or  declared  via  __managed__  variables.  The
       attributes  array  will  be  interpreted  to have numAttributes entries. The dataSizes array will also be
       interpreted to have numAttributes entries. The results of the query will be stored in data.

       The list of supported attributes are given below. Please refer to cudaMemRangeGetAttribute for  attribute
       descriptions and restrictions.

        • cudaMemRangeAttributeReadMostly

        • cudaMemRangeAttributePreferredLocation

        • cudaMemRangeAttributeAccessedBy

        • cudaMemRangeAttributeLastPrefetchLocation

       Parameters:
            data - An array containing pointers to memory locations where the result of each attribute query
            will be written
           dataSizes - Array containing the sizes of each result
           attributes - An array of attributes to query (numAttributes and the  number  of  attributes  in  this
           array should match)
           numAttributes - Number of attributes to query
           devPtr - Start of the range to query
           count - Size of the range to query

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

       See also:
            cudaMemRangeGetAttribute, cudaMemAdvise, cudaMemPrefetchAsync, cuMemRangeGetAttributes
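
        A sketch querying two attributes in one call (devPtr and count as described above):

            int read_mostly = 0;
            int pref_loc = 0;
            void *data[2] = { &read_mostly, &pref_loc };
            size_t data_sizes[2] = { 4, 4 };
            enum cudaMemRangeAttribute attrs[2] = {
                cudaMemRangeAttributeReadMostly,
                cudaMemRangeAttributePreferredLocation
            };
            cudaMemRangeGetAttributes(data, data_sizes, attrs, 2, devPtr, count);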

   cudaError_t cudaMemset (void * devPtr, int value, size_t count)
       Fills the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

       Note  that  this  function  is  asynchronous with respect to the host unless devPtr refers to pinned host
       memory.

       Parameters:
           devPtr - Pointer to device memory
           value - Value to set for each byte of specified memory
           count - Size in bytes to set

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

       See also:
           cuMemsetD8, cuMemsetD16, cuMemsetD32
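
        For example, a sketch zero-filling a freshly allocated buffer (the size n is hypothetical):

            float *d = NULL;
            cudaMalloc((void **)&d, n * sizeof(float));
            /* Sets every byte to zero; all-zero bytes also represent
               0.0f for IEEE-754 floats. */
            cudaMemset(d, 0, n * sizeof(float));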

   cudaError_t cudaMemset2D (void * devPtr, size_t pitch, int value, size_t width, size_t height)
        Sets a matrix (height rows of width bytes each) pointed to by devPtr to the specified value value. pitch
        is the width in bytes of the 2D array pointed to by devPtr, including any padding added to the end of
        each row. This function performs fastest when the pitch is one that has been passed back by
        cudaMallocPitch().

       Note  that  this  function  is  asynchronous with respect to the host unless devPtr refers to pinned host
       memory.

       Parameters:
           devPtr - Pointer to 2D device memory
           pitch - Pitch in bytes of 2D device memory
           value - Value to set for each byte of specified memory
           width - Width of matrix set (columns in bytes)
           height - Height of matrix set (rows)

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

       See also:
           cudaMemset,  cudaMemset3D,  cudaMemsetAsync,  cudaMemset2DAsync,   cudaMemset3DAsync,   cuMemsetD2D8,
           cuMemsetD2D16, cuMemsetD2D32
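
        A sketch pairing cudaMemset2D with a pitched allocation (width_bytes and height are hypothetical):

            void *d = NULL;
            size_t pitch = 0;
            cudaMallocPitch(&d, &pitch, width_bytes, height);
            /* Clears the logical width of each of the `height` rows;
               the padding bytes at the end of each row are untouched. */
            cudaMemset2D(d, pitch, 0, width_bytes, height);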

   __cudart_builtin__  cudaError_t  cudaMemset2DAsync  (void  *  devPtr,  size_t pitch, int value, size_t width,
       size_t height, cudaStream_t stream = 0)
        Sets a matrix (height rows of width bytes each) pointed to by devPtr to the specified value value. pitch
        is the width in bytes of the 2D array pointed to by devPtr, including any padding added to the end of
        each row. This function performs fastest when the pitch is one that has been passed back by
        cudaMallocPitch().

       cudaMemset2DAsync() is asynchronous with respect to the host, so the call may return before the memset is
       complete.  The  operation can optionally be associated to a stream by passing a non-zero stream argument.
       If stream is non-zero, the operation may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot  be  given  local  or
       shared pointers.

       Parameters:
           devPtr - Pointer to 2D device memory
           pitch - Pitch in bytes of 2D device memory
           value - Value to set for each byte of specified memory
           width - Width of matrix set (columns in bytes)
           height - Height of matrix set (rows)
           stream - Stream identifier

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

            This function uses standard default stream semantics.

       See also:
           cudaMemset,   cudaMemset2D,   cudaMemset3D,  cudaMemsetAsync,  cudaMemset3DAsync,  cuMemsetD2D8Async,
           cuMemsetD2D16Async, cuMemsetD2D32Async

   cudaError_t cudaMemset3D (struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent)
       Initializes each element of a 3D array to the specified value value. The object to initialize is  defined
       by  pitchedDevPtr.  The  pitch  field  of  pitchedDevPtr  is the width in memory in bytes of the 3D array
       pointed to by pitchedDevPtr, including any padding added  to  the  end  of  each  row.  The  xsize  field
       specifies  the  logical width of each row in bytes, while the ysize field specifies the height of each 2D
       slice in rows.

       The extents of the initialized region are specified as a width in bytes, a height in rows, and a depth in
       slices.

       Extents with width greater than or equal to the xsize of pitchedDevPtr may perform  significantly  faster
       than  extents  narrower  than  the  xsize.  Secondarily,  extents  with  height  equal  to  the  ysize of
       pitchedDevPtr will perform faster than when the height is shorter than the ysize.

       This function performs fastest when the pitchedDevPtr has been allocated by cudaMalloc3D().

       Note that this function is asynchronous with respect to the host unless pitchedDevPtr  refers  to  pinned
       host memory.

       Parameters:
           pitchedDevPtr - Pointer to pitched device memory
           value - Value to set for each byte of specified memory
           extent - Size parameters for where to set device memory (width field in bytes)

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

       See also:
           cudaMemset,   cudaMemset2D,   cudaMemsetAsync,  cudaMemset2DAsync,  cudaMemset3DAsync,  cudaMalloc3D,
           make_cudaPitchedPtr, make_cudaExtent
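
        A sketch pairing cudaMemset3D with cudaMalloc3D (width_bytes, height, and depth are hypothetical):

            struct cudaExtent extent = make_cudaExtent(width_bytes, height, depth);
            struct cudaPitchedPtr p;
            cudaMalloc3D(&p, extent);
            /* Clears the full logical region; the pitch, xsize, and ysize
               fields of `p` were filled in by cudaMalloc3D. */
            cudaMemset3D(p, 0, extent);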

   __cudart_builtin__ cudaError_t cudaMemset3DAsync (struct  cudaPitchedPtr  pitchedDevPtr,  int  value,  struct
       cudaExtent extent, cudaStream_t stream = 0)
       Initializes  each element of a 3D array to the specified value value. The object to initialize is defined
       by pitchedDevPtr. The pitch field of pitchedDevPtr is the width in  memory  in  bytes  of  the  3D  array
       pointed  to  by  pitchedDevPtr,  including  any  padding  added  to  the end of each row. The xsize field
       specifies the logical width of each row in bytes, while the ysize field specifies the height of  each  2D
       slice in rows.

       The extents of the initialized region are specified as a width in bytes, a height in rows, and a depth in
       slices.

       Extents  with  width greater than or equal to the xsize of pitchedDevPtr may perform significantly faster
       than extents  narrower  than  the  xsize.  Secondarily,  extents  with  height  equal  to  the  ysize  of
       pitchedDevPtr will perform faster than when the height is shorter than the ysize.

       This function performs fastest when the pitchedDevPtr has been allocated by cudaMalloc3D().

       cudaMemset3DAsync() is asynchronous with respect to the host, so the call may return before the memset is
       complete.  The  operation can optionally be associated to a stream by passing a non-zero stream argument.
       If stream is non-zero, the operation may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot  be  given  local  or
       shared pointers.

       Parameters:
           pitchedDevPtr - Pointer to pitched device memory
           value - Value to set for each byte of specified memory
           extent - Size parameters for where to set device memory (width field in bytes)
           stream - Stream identifier

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

            This function uses standard default stream semantics.

       See also:
           cudaMemset,    cudaMemset2D,    cudaMemset3D,   cudaMemsetAsync,   cudaMemset2DAsync,   cudaMalloc3D,
           make_cudaPitchedPtr, make_cudaExtent

   __cudart_builtin__ cudaError_t cudaMemsetAsync (void * devPtr, int value, size_t count, cudaStream_t stream =
       0)
       Fills the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

       cudaMemsetAsync() is asynchronous with respect to the host, so the call may return before the  memset  is
       complete.  The  operation can optionally be associated to a stream by passing a non-zero stream argument.
       If stream is non-zero, the operation may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot  be  given  local  or
       shared pointers.

       Parameters:
           devPtr - Pointer to device memory
           value - Value to set for each byte of specified memory
           count - Size in bytes to set
           stream - Stream identifier

       Returns:
            cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous launches.

            See also memset synchronization details.

            This function uses standard default stream semantics.

       See also:
           cudaMemset,   cudaMemset2D,   cudaMemset3D,  cudaMemset2DAsync,  cudaMemset3DAsync,  cuMemsetD8Async,
           cuMemsetD16Async, cuMemsetD32Async
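
        A sketch of clearing a buffer in stream order (d, n, stream, my_kernel, blocks, and threads are all
        hypothetical):

            /* Enqueued on `stream`; later launches on the same stream are
               ordered after the memset completes. */
            cudaMemsetAsync(d, 0, n * sizeof(float), stream);
            my_kernel<<<blocks, threads, 0, stream>>>(d, n);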

   struct cudaExtent make_cudaExtent (size_t w, size_t h, size_t d) [read]
       Returns a cudaExtent based on the specified input parameters w, h, and d.

       Parameters:
           w - Width in elements when referring to array memory, in bytes when referring to linear memory
           h - Height in elements
           d - Depth in elements

       Returns:
           cudaExtent specified by w, h, and d

       See also:
           make_cudaPitchedPtr, make_cudaPos

   struct cudaPitchedPtr make_cudaPitchedPtr (void * d, size_t p, size_t xsz, size_t ysz) [read]
       Returns a cudaPitchedPtr based on the specified input parameters d, p, xsz, and ysz.

       Parameters:
           d - Pointer to allocated memory
           p - Pitch of allocated memory in bytes
           xsz - Logical width of allocation in elements
           ysz - Logical height of allocation in elements

       Returns:
           cudaPitchedPtr specified by d, p, xsz, and ysz

       See also:
           make_cudaExtent, make_cudaPos

   struct cudaPos make_cudaPos (size_t x, size_t y, size_t z) [read]
       Returns a cudaPos based on the specified input parameters x, y, and z.

       Parameters:
           x - X position
           y - Y position
           z - Z position

       Returns:
           cudaPos specified by x, y, and z

       See also:
           make_cudaExtent, make_cudaPitchedPtr
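
        A sketch combining the three helpers to describe a host-to-device 3D copy with cudaMemcpy3D (host_buf,
        dev_pitched from a prior cudaMalloc3D, and the sizes are hypothetical):

            struct cudaMemcpy3DParms p = {0};
            p.srcPtr = make_cudaPitchedPtr(host_buf, width_bytes,
                                           width_bytes, height);
            p.srcPos = make_cudaPos(0, 0, 0);
            p.dstPtr = dev_pitched;            /* from cudaMalloc3D */
            p.dstPos = make_cudaPos(0, 0, 0);
            p.extent = make_cudaExtent(width_bytes, height, depth);
            p.kind   = cudaMemcpyHostToDevice;
            cudaMemcpy3D(&p);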

Author

       Generated automatically by Doxygen from the source code.

Version 6.0                                        3 Nov 2017                               Memory Management(3)