Provided by: nvidia-cuda-dev_9.1.85-3ubuntu1_amd64 bug

NAME

       Memory Management - memory management functions of the CUDA runtime API

   Functions
       cudaError_t cudaArrayGetInfo (struct cudaChannelFormatDesc *desc, struct cudaExtent
           *extent, unsigned int *flags, cudaArray_t array)
           Gets info about the specified cudaArray.
       __cudart_builtin__ cudaError_t cudaFree (void *devPtr)
           Frees memory on the device.
       cudaError_t cudaFreeArray (cudaArray_t array)
           Frees an array on the device.
       cudaError_t cudaFreeHost (void *ptr)
           Frees page-locked memory.
       cudaError_t cudaFreeMipmappedArray (cudaMipmappedArray_t mipmappedArray)
           Frees a mipmapped array on the device.
       cudaError_t cudaGetMipmappedArrayLevel (cudaArray_t *levelArray,
           cudaMipmappedArray_const_t mipmappedArray, unsigned int level)
           Gets a mipmap level of a CUDA mipmapped array.
       cudaError_t cudaGetSymbolAddress (void **devPtr, const void *symbol)
           Finds the address associated with a CUDA symbol.
       cudaError_t cudaGetSymbolSize (size_t *size, const void *symbol)
           Finds the size of the object associated with a CUDA symbol.
       cudaError_t cudaHostAlloc (void **pHost, size_t size, unsigned int flags)
           Allocates page-locked memory on the host.
       cudaError_t cudaHostGetDevicePointer (void **pDevice, void *pHost, unsigned int flags)
           Passes back device pointer of mapped host memory allocated by cudaHostAlloc or
           registered by cudaHostRegister.
       cudaError_t cudaHostGetFlags (unsigned int *pFlags, void *pHost)
           Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.
       cudaError_t cudaHostRegister (void *ptr, size_t size, unsigned int flags)
           Registers an existing host memory range for use by CUDA.
       cudaError_t cudaHostUnregister (void *ptr)
           Unregisters a memory range that was registered with cudaHostRegister.
       __cudart_builtin__ cudaError_t cudaMalloc (void **devPtr, size_t size)
           Allocate memory on the device.
       cudaError_t cudaMalloc3D (struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent)
           Allocates logical 1D, 2D, or 3D memory objects on the device.
       cudaError_t cudaMalloc3DArray (cudaArray_t *array, const struct cudaChannelFormatDesc
           *desc, struct cudaExtent extent, unsigned int flags=0)
           Allocate an array on the device.
       cudaError_t cudaMallocArray (cudaArray_t *array, const struct cudaChannelFormatDesc *desc,
           size_t width, size_t height=0, unsigned int flags=0)
           Allocate an array on the device.
       cudaError_t cudaMallocHost (void **ptr, size_t size)
           Allocates page-locked memory on the host.
       __cudart_builtin__ cudaError_t cudaMallocManaged (void **devPtr, size_t size, unsigned int
           flags=0x01)
           Allocates memory that will be automatically managed by the Unified Memory system.
       cudaError_t cudaMallocMipmappedArray (cudaMipmappedArray_t *mipmappedArray, const struct
           cudaChannelFormatDesc *desc, struct cudaExtent extent, unsigned int numLevels,
           unsigned int flags=0)
           Allocate a mipmapped array on the device.
       cudaError_t cudaMallocPitch (void **devPtr, size_t *pitch, size_t width, size_t height)
           Allocates pitched memory on the device.
       cudaError_t cudaMemAdvise (const void *devPtr, size_t count, enum cudaMemoryAdvise advice,
           int device)
           Advise about the usage of a given memory range.
       cudaError_t cudaMemcpy (void *dst, const void *src, size_t count, enum cudaMemcpyKind
           kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2D (void *dst, size_t dpitch, const void *src, size_t spitch, size_t
           width, size_t height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t
           hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
           size_t height, enum cudaMemcpyKind kind=cudaMemcpyDeviceToDevice)
           Copies data between host and device.
       __cudart_builtin__ cudaError_t cudaMemcpy2DAsync (void *dst, size_t dpitch, const void
           *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
           cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DFromArray (void *dst, size_t dpitch, cudaArray_const_t src, size_t
           wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DFromArrayAsync (void *dst, size_t dpitch, cudaArray_const_t src,
           size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
           cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const
           void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpy2DToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset,
           const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
           cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpy3D (const struct cudaMemcpy3DParms *p)
           Copies data between 3D objects.
       __cudart_builtin__ cudaError_t cudaMemcpy3DAsync (const struct cudaMemcpy3DParms *p,
           cudaStream_t stream=0)
           Copies data between 3D objects.
       cudaError_t cudaMemcpy3DPeer (const struct cudaMemcpy3DPeerParms *p)
           Copies memory between devices.
       cudaError_t cudaMemcpy3DPeerAsync (const struct cudaMemcpy3DPeerParms *p, cudaStream_t
           stream=0)
           Copies memory between devices asynchronously.
       cudaError_t cudaMemcpyArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
           cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum
           cudaMemcpyKind kind=cudaMemcpyDeviceToDevice)
           Copies data between host and device.
       __cudart_builtin__ cudaError_t cudaMemcpyAsync (void *dst, const void *src, size_t count,
           enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromArray (void *dst, cudaArray_const_t src, size_t wOffset, size_t
           hOffset, size_t count, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromArrayAsync (void *dst, cudaArray_const_t src, size_t wOffset,
           size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyFromSymbol (void *dst, const void *symbol, size_t count, size_t
           offset=0, enum cudaMemcpyKind kind=cudaMemcpyDeviceToHost)
           Copies data from the given symbol on the device.
       cudaError_t cudaMemcpyFromSymbolAsync (void *dst, const void *symbol, size_t count, size_t
           offset, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data from the given symbol on the device.
       cudaError_t cudaMemcpyPeer (void *dst, int dstDevice, const void *src, int srcDevice,
           size_t count)
           Copies memory between two devices.
       cudaError_t cudaMemcpyPeerAsync (void *dst, int dstDevice, const void *src, int srcDevice,
           size_t count, cudaStream_t stream=0)
           Copies memory between two devices asynchronously.
       cudaError_t cudaMemcpyToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void
           *src, size_t count, enum cudaMemcpyKind kind)
           Copies data between host and device.
       cudaError_t cudaMemcpyToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const
           void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data between host and device.
       cudaError_t cudaMemcpyToSymbol (const void *symbol, const void *src, size_t count, size_t
           offset=0, enum cudaMemcpyKind kind=cudaMemcpyHostToDevice)
           Copies data to the given symbol on the device.
       cudaError_t cudaMemcpyToSymbolAsync (const void *symbol, const void *src, size_t count,
           size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream=0)
           Copies data to the given symbol on the device.
       cudaError_t cudaMemGetInfo (size_t *free, size_t *total)
           Gets free and total device memory.
       cudaError_t cudaMemPrefetchAsync (const void *devPtr, size_t count, int dstDevice,
           cudaStream_t stream=0)
           Prefetches memory to the specified destination device.
       cudaError_t cudaMemRangeGetAttribute (void *data, size_t dataSize, enum
           cudaMemRangeAttribute attribute, const void *devPtr, size_t count)
           Query an attribute of a given memory range.
       cudaError_t cudaMemRangeGetAttributes (void **data, size_t *dataSizes, enum
           cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t
           count)
           Query attributes of a given memory range.
       cudaError_t cudaMemset (void *devPtr, int value, size_t count)
           Initializes or sets device memory to a value.
       cudaError_t cudaMemset2D (void *devPtr, size_t pitch, int value, size_t width, size_t
           height)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemset2DAsync (void *devPtr, size_t pitch, int value,
           size_t width, size_t height, cudaStream_t stream=0)
           Initializes or sets device memory to a value.
       cudaError_t cudaMemset3D (struct cudaPitchedPtr pitchedDevPtr, int value, struct
           cudaExtent extent)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemset3DAsync (struct cudaPitchedPtr pitchedDevPtr, int
           value, struct cudaExtent extent, cudaStream_t stream=0)
           Initializes or sets device memory to a value.
       __cudart_builtin__ cudaError_t cudaMemsetAsync (void *devPtr, int value, size_t count,
           cudaStream_t stream=0)
           Initializes or sets device memory to a value.
       struct cudaExtent make_cudaExtent (size_t w, size_t h, size_t d)
           Returns a cudaExtent based on input parameters.
       struct cudaPitchedPtr make_cudaPitchedPtr (void *d, size_t p, size_t xsz, size_t ysz)
           Returns a cudaPitchedPtr based on input parameters.
       struct cudaPos make_cudaPos (size_t x, size_t y, size_t z)
           Returns a cudaPos based on input parameters.

Detailed Description

       Memory management functions of the CUDA runtime API (cuda_runtime_api.h).

       This section describes the memory management functions of the CUDA runtime application
       programming interface.

       Some functions have overloaded C++ API template versions documented separately in the C++
       API Routines module.

Function Documentation

   cudaError_t cudaArrayGetInfo (struct cudaChannelFormatDesc * desc, struct cudaExtent * extent,
       unsigned int * flags, cudaArray_t array)
       Returns in *desc, *extent and *flags respectively, the type, shape and flags of array.

       Any of *desc, *extent and *flags may be specified as NULL.

       Parameters:
           desc - Returned array type
           extent - Returned array shape. 2D arrays will have depth of zero
           flags - Returned array flags
           array - The cudaArray to get info for

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cuArrayGetDescriptor, cuArray3DGetDescriptor

   __cudart_builtin__ cudaError_t cudaFree (void * devPtr)
       Frees the memory space pointed to by devPtr, which must have been returned by a previous
       call to cudaMalloc() or cudaMallocPitch(). Otherwise, or if cudaFree(devPtr) has already
       been called before, an error is returned. If devPtr is 0, no operation is performed.
       cudaFree() returns cudaErrorInvalidDevicePointer in case of failure.

       The device version of cudaFree cannot be used with a *devPtr allocated using the host API,
       and vice versa.

       Parameters:
           devPtr - Device pointer to memory to free

       Returns:
           cudaSuccess, cudaErrorInvalidDevicePointer, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaMallocArray, cudaFreeArray, cudaMallocHost (C API),
           cudaFreeHost, cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemFree

   cudaError_t cudaFreeArray (cudaArray_t array)
       Frees the CUDA array array, which must have been returned by a previous call to
       cudaMallocArray(). If cudaFreeArray(array) has already been called before,
       cudaErrorInvalidValue is returned. If array is 0, no operation is performed.

       Parameters:
           array - Pointer to array to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaMallocHost (C API),
           cudaFreeHost, cudaHostAlloc, cuArrayDestroy

   cudaError_t cudaFreeHost (void * ptr)
       Frees the memory space pointed to by ptr, which must have been returned by a previous
       call to cudaMallocHost() or cudaHostAlloc().

       Parameters:
           ptr - Pointer to memory to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaFreeArray, cudaMallocHost
           (C API), cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemFreeHost

   cudaError_t cudaFreeMipmappedArray (cudaMipmappedArray_t mipmappedArray)
       Frees the CUDA mipmapped array mipmappedArray, which must have been returned by a previous
       call to cudaMallocMipmappedArray(). If cudaFreeMipmappedArray(mipmappedArray) has already
       been called before, cudaErrorInvalidValue is returned.

       Parameters:
           mipmappedArray - Pointer to mipmapped array to free

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInitializationError

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaMallocArray, cudaMallocHost (C API),
           cudaFreeHost, cudaHostAlloc, cuMipmappedArrayDestroy

   cudaError_t cudaGetMipmappedArrayLevel (cudaArray_t * levelArray, cudaMipmappedArray_const_t
       mipmappedArray, unsigned int level)
       Returns in *levelArray a CUDA array that represents a single mipmap level of the CUDA
       mipmapped array mipmappedArray.

       If level is greater than the maximum number of levels in this mipmapped array,
       cudaErrorInvalidValue is returned.

       Parameters:
           levelArray - Returned mipmap level CUDA array
           mipmappedArray - CUDA mipmapped array
           level - Mipmap level

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc3D, cudaMalloc, cudaMallocPitch, cudaFree, cudaFreeArray, cudaMallocHost (C
           API), cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuMipmappedArrayGetLevel

   cudaError_t cudaGetSymbolAddress (void ** devPtr, const void * symbol)
       Returns in *devPtr the address of symbol symbol on the device. symbol is a variable that
       resides in global or constant memory space. If symbol cannot be found, or if symbol is not
       declared in the global or constant memory space, *devPtr is unchanged and the error
       cudaErrorInvalidSymbol is returned.

       Parameters:
           devPtr - Return device pointer associated with symbol
           symbol - Device symbol address

       Returns:
           cudaSuccess, cudaErrorInvalidSymbol, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaGetSymbolAddress (C++ API), cudaGetSymbolSize (C API), cuModuleGetGlobal

   cudaError_t cudaGetSymbolSize (size_t * size, const void * symbol)
       Returns in *size the size of symbol symbol. symbol is a variable that resides in global or
       constant memory space. If symbol cannot be found, or if symbol is not declared in global
       or constant memory space, *size is unchanged and the error cudaErrorInvalidSymbol is
       returned.

       Parameters:
           size - Size of object associated with symbol
           symbol - Device symbol address

       Returns:
           cudaSuccess, cudaErrorInvalidSymbol, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaGetSymbolAddress (C API), cudaGetSymbolSize (C++ API), cuModuleGetGlobal

   cudaError_t cudaHostAlloc (void ** pHost, size_t size, unsigned int flags)
       Allocates size bytes of host memory that is page-locked and accessible to the device. The
       driver tracks the virtual memory ranges allocated with this function and automatically
       accelerates calls to functions such as cudaMemcpy(). Since the memory can be accessed
       directly by the device, it can be read or written with much higher bandwidth than pageable
       memory obtained with functions such as malloc(). Allocating excessive amounts of pinned
       memory may degrade system performance, since it reduces the amount of memory available to
       the system for paging. As a result, this function is best used sparingly to allocate
       staging areas for data exchange between host and device.

       The flags parameter enables different options to be specified that affect the allocation,
       as follows.

       • cudaHostAllocDefault: This flag's value is defined to be 0 and causes cudaHostAlloc() to
         emulate cudaMallocHost().

       • cudaHostAllocPortable: The memory returned by this call will be considered as pinned
         memory by all CUDA contexts, not just the one that performed the allocation.

       • cudaHostAllocMapped: Maps the allocation into the CUDA address space. The device pointer
         to the memory may be obtained by calling cudaHostGetDevicePointer().

       • cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). WC memory can
         be transferred across the PCI Express bus more quickly on some system configurations,
         but cannot be read efficiently by most CPUs. WC memory is a good option for buffers that
         will be written by the CPU and read by the device via mapped pinned memory or
         host->device transfers.

       All of these flags are orthogonal to one another: a developer may allocate memory that is
       portable, mapped and/or write-combined with no restrictions.

       In order for the cudaHostAllocMapped flag to have any effect, the CUDA context must
       support the cudaDeviceMapHost flag, which can be checked via cudaGetDeviceFlags(). The
       cudaDeviceMapHost flag is implicitly set for contexts created via the runtime API.

       The cudaHostAllocMapped flag may be specified on CUDA contexts for devices that do not
       support mapped pinned memory. The failure is deferred to cudaHostGetDevicePointer()
       because the memory may be mapped into other CUDA contexts via the cudaHostAllocPortable
       flag.

       Memory allocated by this function must be freed with cudaFreeHost().

       Parameters:
           pHost - Returned pointer to the allocated page-locked host memory
           size - Requested allocation size in bytes
           flags - Requested properties of allocated memory

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaSetDeviceFlags, cudaMallocHost (C API), cudaFreeHost, cudaGetDeviceFlags,
           cuMemHostAlloc

   cudaError_t cudaHostGetDevicePointer (void ** pDevice, void * pHost, unsigned int flags)
       Passes back the device pointer corresponding to the mapped, pinned host buffer allocated
       by cudaHostAlloc() or registered by cudaHostRegister().

       cudaHostGetDevicePointer() will fail if the cudaDeviceMapHost flag was not specified
       before deferred context creation occurred, or if called on a device that does not support
       mapped, pinned memory.

       For devices that have a non-zero value for the device attribute
       cudaDevAttrCanUseHostPointerForRegisteredMem, the memory can also be accessed from the
       device using the host pointer pHost. The device pointer returned by
       cudaHostGetDevicePointer() may or may not match the original host pointer pHost and
       depends on the devices visible to the application. If all devices visible to the
       application have a non-zero value for the device attribute, the device pointer returned by
       cudaHostGetDevicePointer() will match the original pointer pHost. If any device visible to
       the application has a zero value for the device attribute, the device pointer returned by
       cudaHostGetDevicePointer() will not match the original host pointer pHost, but it will be
       suitable for use on all devices provided Unified Virtual Addressing is enabled. In such
       systems, it is valid to access the memory using either pointer on devices that have a non-
       zero value for the device attribute. Note however that such devices should access the
       memory using only one of the two pointers and not both.

       flags provides for future releases. For now, it must be set to 0.

       Parameters:
           pDevice - Returned device pointer for mapped memory
           pHost - Requested host pointer mapping
           flags - Flags for extensions (must be 0 for now)

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaSetDeviceFlags, cudaHostAlloc, cuMemHostGetDevicePointer

   cudaError_t cudaHostGetFlags (unsigned int * pFlags, void * pHost)
       Passes back in *pFlags the flags used to allocate the pinned host memory pointed to by
       pHost. cudaHostGetFlags() will fail if the input pointer does not reside in an address
       range allocated by cudaHostAlloc().

       Parameters:
           pFlags - Returned flags word
           pHost - Host pointer

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaHostAlloc, cuMemHostGetFlags

   cudaError_t cudaHostRegister (void * ptr, size_t size, unsigned int flags)
       Page-locks the memory range specified by ptr and size and maps it for the device(s) as
       specified by flags. This memory range also is added to the same tracking mechanism as
       cudaHostAlloc() to automatically accelerate calls to functions such as cudaMemcpy(). Since
       the memory can be accessed directly by the device, it can be read or written with much
       higher bandwidth than pageable memory that has not been registered. Page-locking excessive
       amounts of memory may degrade system performance, since it reduces the amount of memory
       available to the system for paging. As a result, this function is best used sparingly to
       register staging areas for data exchange between host and device.

       cudaHostRegister is not supported on non I/O coherent devices.

       The flags parameter enables different options to be specified that affect the allocation,
       as follows.

       • cudaHostRegisterDefault: On a system with unified virtual addressing, the memory will be
         both mapped and portable. On a system with no unified virtual addressing, the memory
         will be neither mapped nor portable.

       • cudaHostRegisterPortable: The memory returned by this call will be considered as pinned
         memory by all CUDA contexts, not just the one that performed the allocation.

       • cudaHostRegisterMapped: Maps the allocation into the CUDA address space. The device
         pointer to the memory may be obtained by calling cudaHostGetDevicePointer().

       • cudaHostRegisterIoMemory: The passed memory pointer is treated as pointing to some
         memory-mapped I/O space, e.g. belonging to a third-party PCIe device, and it will be
         marked as non cache-coherent and contiguous.

       All of these flags are orthogonal to one another: a developer may page-lock memory that is
       portable or mapped with no restrictions.

       The CUDA context must have been created with the cudaDeviceMapHost flag in order for the
       cudaHostRegisterMapped flag to have any effect.

       The cudaHostRegisterMapped flag may be specified on CUDA contexts for devices that do not
       support mapped pinned memory. The failure is deferred to cudaHostGetDevicePointer()
       because the memory may be mapped into other CUDA contexts via the cudaHostRegisterPortable
       flag.

       For devices that have a non-zero value for the device attribute
       cudaDevAttrCanUseHostPointerForRegisteredMem, the memory can also be accessed from the
       device using the host pointer ptr. The device pointer returned by
       cudaHostGetDevicePointer() may or may not match the original host pointer ptr and depends
       on the devices visible to the application. If all devices visible to the application have
       a non-zero value for the device attribute, the device pointer returned by
       cudaHostGetDevicePointer() will match the original pointer ptr. If any device visible to
       the application has a zero value for the device attribute, the device pointer returned by
       cudaHostGetDevicePointer() will not match the original host pointer ptr, but it will be
       suitable for use on all devices provided Unified Virtual Addressing is enabled. In such
       systems, it is valid to access the memory using either pointer on devices that have a non-
       zero value for the device attribute. Note however that such devices should access the
       memory using only one of the two pointers and not both.

       The memory page-locked by this function must be unregistered with cudaHostUnregister().

       Parameters:
           ptr - Host pointer to memory to page-lock
           size - Size in bytes of the address range to page-lock
           flags - Flags for allocation request

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation,
           cudaErrorHostMemoryAlreadyRegistered, cudaErrorNotSupported

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaHostUnregister, cudaHostGetFlags, cudaHostGetDevicePointer, cuMemHostRegister

   cudaError_t cudaHostUnregister (void * ptr)
       Unmaps the memory range whose base address is specified by ptr, and makes it pageable
       again.

       The base address must be the same one specified to cudaHostRegister().

       Parameters:
           ptr - Host pointer to memory to unregister

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorHostMemoryNotRegistered

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaHostRegister, cuMemHostUnregister

   __cudart_builtin__ cudaError_t cudaMalloc (void ** devPtr, size_t size)
       Allocates size bytes of linear memory on the device and returns in *devPtr a pointer to
       the allocated memory. The allocated memory is suitably aligned for any kind of variable.
       The memory is not cleared. cudaMalloc() returns cudaErrorMemoryAllocation in case of
       failure.

       The device version of cudaFree cannot be used with a *devPtr allocated using the host API,
       and vice versa.

       Parameters:
           devPtr - Pointer to allocated device memory
           size - Requested allocation size in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       See also:
           cudaMallocPitch, cudaFree, cudaMallocArray, cudaFreeArray, cudaMalloc3D,
           cudaMalloc3DArray, cudaMallocHost (C API), cudaFreeHost, cudaHostAlloc, cuMemAlloc

   cudaError_t cudaMalloc3D (struct cudaPitchedPtr * pitchedDevPtr, struct cudaExtent extent)
       Allocates at least width * height * depth bytes of linear memory on the device and returns
       a cudaPitchedPtr in which ptr is a pointer to the allocated memory. The function may pad
       the allocation to ensure hardware alignment requirements are met. The pitch returned in
       the pitch field of pitchedDevPtr is the width in bytes of the allocation.

       The returned cudaPitchedPtr contains additional fields xsize and ysize, the logical width
       and height of the allocation, which are equivalent to the width and height extent
       parameters provided by the programmer during allocation.

       For allocations of 2D and 3D objects, it is highly recommended that programmers perform
       allocations using cudaMalloc3D() or cudaMallocPitch(). Due to alignment restrictions in
       the hardware, this is especially true if the application will be performing memory copies
       involving 2D or 3D objects (whether linear memory or CUDA arrays).

       Parameters:
           pitchedDevPtr - Pointer to allocated pitched device memory
           extent - Requested allocation size (width field in bytes)

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMallocPitch, cudaFree, cudaMemcpy3D, cudaMemset3D, cudaMalloc3DArray,
           cudaMallocArray, cudaFreeArray, cudaMallocHost (C API), cudaFreeHost, cudaHostAlloc,
           make_cudaPitchedPtr, make_cudaExtent, cuMemAllocPitch

   cudaError_t cudaMalloc3DArray (cudaArray_t * array, const struct cudaChannelFormatDesc * desc,
       struct cudaExtent extent, unsigned int flags = 0)
       Allocates a CUDA array according to the cudaChannelFormatDesc structure desc and returns a
       handle to the new CUDA array in *array.

       The cudaChannelFormatDesc is defined as:

           struct cudaChannelFormatDesc {
               int x, y, z, w;
               enum cudaChannelFormatKind f;
           };

       where cudaChannelFormatKind is one of cudaChannelFormatKindSigned,
       cudaChannelFormatKindUnsigned, or cudaChannelFormatKindFloat.

       cudaMalloc3DArray() can allocate the following:

       • A 1D array is allocated if the height and depth extents are both zero.

       • A 2D array is allocated if only the depth extent is zero.

       • A 3D array is allocated if all three extents are non-zero.

       • A 1D layered CUDA array is allocated if only the height extent is zero and the
         cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is
         determined by the depth extent.

       • A 2D layered CUDA array is allocated if all three extents are non-zero and the
         cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is
         determined by the depth extent.

       • A cubemap CUDA array is allocated if all three extents are non-zero and the
         cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A
         cubemap is a special type of 2D layered CUDA array, where the six layers represent the
         six faces of a cube. The order of the six layers in memory is the same as that listed in
         cudaGraphicsCubeFace.

       • A cubemap layered CUDA array is allocated if all three extents are non-zero and both the
         cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and
         depth must be a multiple of six. A cubemap layered CUDA array is a special type of 2D
         layered CUDA array that consists of a collection of cubemaps. The first six layers
         represent the first cubemap, the next six layers form the second cubemap, and so on.

       The flags parameter enables different options to be specified that affect the allocation,
       as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default array
         allocation

       • cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the
         number of layers

       • cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and
         depth must be six. If the cudaArrayLayered flag is also set, depth must be a multiple of
         six.

       • cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to
         using a surface reference.

       • cudaArrayTextureGather: This flag indicates that texture gather operations will be
         performed on the CUDA array. Texture gather can only be performed on 2D CUDA arrays.

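       The width, height and depth extents must meet certain size requirements, which depend on
       the type of CUDA array and on the device limits reported in cudaDeviceProp (for example
       maxTexture1D, maxTexture2D and maxTexture3D, or maxSurface1D, maxSurface2D and
       maxSurface3D when cudaArraySurfaceLoadStore is set). The table listing these limits is
       not reproduced in this manual page; see the cudaMalloc3DArray entry in the CUDA Runtime
       API reference. All values are specified in elements.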

       Note that 2D CUDA arrays have different size requirements if the cudaArrayTextureGather
       flag is set. In that case, the valid range for (width, height, depth) is
       ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0).

       Parameters:
           array - Pointer to allocated array in device memory
           desc - Requested channel format
           extent - Requested allocation size (width field in elements)
           flags - Flags for extensions

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc3D, cudaMalloc, cudaMallocPitch, cudaFree, cudaFreeArray, cudaMallocHost (C
           API), cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuArray3DCreate

   cudaError_t cudaMallocArray (cudaArray_t * array, const struct cudaChannelFormatDesc * desc,
       size_t width, size_t height = 0, unsigned int flags = 0)
       Allocates a CUDA array according to the cudaChannelFormatDesc structure desc and returns a
       handle to the new CUDA array in *array.

       The cudaChannelFormatDesc is defined as:

           struct cudaChannelFormatDesc {
               int x, y, z, w;
               enum cudaChannelFormatKind f;
           };

       where cudaChannelFormatKind is one of cudaChannelFormatKindSigned,
       cudaChannelFormatKindUnsigned, or cudaChannelFormatKindFloat.

       The flags parameter enables different options to be specified that affect the allocation,
       as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default array
         allocation

       • cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using
         a surface reference

       • cudaArrayTextureGather: This flag indicates that texture gather operations will be
         performed on the array.

       width and height must meet certain size requirements. See cudaMalloc3DArray() for more
       details.

       Parameters:
           array - Pointer to allocated array in device memory
           desc - Requested channel format
           width - Requested array allocation width
           height - Requested array allocation height
           flags - Requested properties of allocated array

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaFree, cudaFreeArray, cudaMallocHost (C API),
           cudaFreeHost, cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuArrayCreate

   cudaError_t cudaMallocHost (void ** ptr, size_t size)
       Allocates size bytes of host memory that is page-locked and accessible to the device. The
       driver tracks the virtual memory ranges allocated with this function and automatically
       accelerates calls to functions such as cudaMemcpy*(). Since the memory can be accessed
       directly by the device, it can be read or written with much higher bandwidth than pageable
       memory obtained with functions such as malloc(). Allocating excessive amounts of memory
       with cudaMallocHost() may degrade system performance, since it reduces the amount of
       memory available to the system for paging. As a result, this function is best used
       sparingly to allocate staging areas for data exchange between host and device.

       Parameters:
           ptr - Pointer to allocated host memory
           size - Requested allocation size in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaMallocPitch, cudaMallocArray, cudaMalloc3D, cudaMalloc3DArray,
           cudaHostAlloc, cudaFree, cudaFreeArray, cudaMallocHost (C++ API), cudaFreeHost,
           cuMemAllocHost

   __cudart_builtin__ cudaError_t cudaMallocManaged (void ** devPtr, size_t size, unsigned int
       flags = 0x01)
       Allocates size bytes of managed memory on the device and returns in *devPtr a pointer to
       the allocated memory. If the device doesn't support allocating managed memory,
       cudaErrorNotSupported is returned. Support for managed memory can be queried using the
       device attribute cudaDevAttrManagedMemory. The allocated memory is suitably aligned for
       any kind of variable. The memory is not cleared. If size is 0, cudaMallocManaged returns
       cudaErrorInvalidValue. The pointer is valid on the CPU and on all GPUs in the system that
       support managed memory. All accesses to this pointer must obey the Unified Memory
       programming model.

       flags specifies the default stream association for this allocation. flags must be one of
       cudaMemAttachGlobal or cudaMemAttachHost. The default value for flags is
       cudaMemAttachGlobal. If cudaMemAttachGlobal is specified, then this memory is accessible
       from any stream on any device. If cudaMemAttachHost is specified, then the allocation
       should not be accessed from devices that have a zero value for the device attribute
       cudaDevAttrConcurrentManagedAccess; an explicit call to cudaStreamAttachMemAsync will be
       required to enable access on such devices.

       If the association is later changed via cudaStreamAttachMemAsync to a single stream, the
       default association, as specified during cudaMallocManaged, is restored when that stream
       is destroyed. For __managed__ variables, the default association is always
       cudaMemAttachGlobal. Note that destroying a stream is an asynchronous operation, and as a
       result, the change to default association won't happen until all work in the stream has
       completed.

       Memory allocated with cudaMallocManaged should be released with cudaFree.

       Device memory oversubscription is possible for GPUs that have a non-zero value for the
       device attribute cudaDevAttrConcurrentManagedAccess. Managed memory on such GPUs may be
       evicted from device memory to host memory at any time by the Unified Memory driver in
       order to make room for other allocations.

       In a multi-GPU system where all GPUs have a non-zero value for the device attribute
       cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this API
       returns and instead may be populated on access. In such systems, managed memory can
       migrate to any processor's memory at any time. The Unified Memory driver will employ
       heuristics to maintain data locality and prevent excessive page faults to the extent
       possible. The application can also guide the driver about memory usage patterns via
       cudaMemAdvise. The application can also explicitly migrate memory to a desired processor's
       memory via cudaMemPrefetchAsync.

       In a multi-GPU system where all of the GPUs have a zero value for the device attribute
       cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support with each
       other, the physical storage for managed memory is created on the GPU which is active at
       the time cudaMallocManaged is called. All other GPUs will reference the data at reduced
       bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
       memory among such GPUs.

       In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
       where the value of the device attribute cudaDevAttrConcurrentManagedAccess is zero for at
       least one of those GPUs, the location chosen for physical storage of managed memory is
       system-dependent.

       • On Linux, the location chosen will be device memory as long as the current set of active
         contexts are on devices that either have peer-to-peer support with each other or have a
         non-zero value for the device attribute cudaDevAttrConcurrentManagedAccess. If there is
         an active context on a GPU that does not have a non-zero value for that device attribute
         and it does not have peer-to-peer support with the other devices that have active
         contexts on them, then the location for physical storage will be 'zero-copy' or host
         memory. Note that this means that managed memory that is located in device memory is
         migrated to host memory if a new context is created on a GPU that doesn't have a non-
         zero value for the device attribute and does not support peer-to-peer with at least one
         of the other devices that has an active context. This in turn implies that context
         creation may fail if there is insufficient host memory to migrate all managed
         allocations.

       • On Windows, the physical storage is always created in 'zero-copy' or host memory. All
         GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
         circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
         restrict CUDA to only use those GPUs that have peer-to-peer support. Alternatively,
         users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero value to force the
         driver to always use device memory for physical storage. When this environment variable
         is set to a non-zero value, all devices used in that process that support managed memory
         have to be peer-to-peer compatible with each other. The error cudaErrorInvalidDevice
         will be returned if a device that supports managed memory is used and it is not peer-to-
         peer compatible with any of the other managed memory supporting devices that were
         previously used in that process, even if cudaDeviceReset has been called on those
         devices. These environment variables are described in the CUDA programming guide under
         the 'CUDA environment variables' section.

       Parameters:
           devPtr - Pointer to allocated device memory
           size - Requested allocation size in bytes
           flags - Must be either cudaMemAttachGlobal or cudaMemAttachHost (defaults to
           cudaMemAttachGlobal)

       Returns:
           cudaSuccess, cudaErrorMemoryAllocation, cudaErrorNotSupported, cudaErrorInvalidValue

       See also:
           cudaMallocPitch, cudaFree, cudaMallocArray, cudaFreeArray, cudaMalloc3D,
           cudaMalloc3DArray, cudaMallocHost (C API), cudaFreeHost, cudaHostAlloc,
           cudaDeviceGetAttribute, cudaStreamAttachMemAsync, cuMemAllocManaged

       Note:
           The default value of flags, 0x01, is cudaMemAttachGlobal: memory can be accessed by
           any stream on any device.

   cudaError_t cudaMallocMipmappedArray (cudaMipmappedArray_t * mipmappedArray, const struct
       cudaChannelFormatDesc * desc, struct cudaExtent extent, unsigned int numLevels, unsigned
       int flags = 0)
       Allocates a CUDA mipmapped array according to the cudaChannelFormatDesc structure desc and
       returns a handle to the new CUDA mipmapped array in *mipmappedArray. numLevels specifies
       the number of mipmap levels to be allocated. This value is clamped to the range [1, 1 +
       floor(log2(max(width, height, depth)))].

       The cudaChannelFormatDesc is defined as:

           struct cudaChannelFormatDesc {
               int x, y, z, w;
               enum cudaChannelFormatKind f;
           };

       where cudaChannelFormatKind is one of cudaChannelFormatKindSigned,
       cudaChannelFormatKindUnsigned, or cudaChannelFormatKindFloat.

       cudaMallocMipmappedArray() can allocate the following:

       • A 1D mipmapped array is allocated if the height and depth extents are both zero.

       • A 2D mipmapped array is allocated if only the depth extent is zero.

       • A 3D mipmapped array is allocated if all three extents are non-zero.

       • A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and the
         cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers
         is determined by the depth extent.

       • A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and the
         cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers
         is determined by the depth extent.

       • A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
         cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. The
         order of the six layers in memory is the same as that listed in cudaGraphicsCubeFace.

       • A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero
         and both the cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to
         height, and depth must be a multiple of six. A cubemap layered CUDA mipmapped array is a
         special type of 2D layered CUDA mipmapped array that consists of a collection of cubemap
         mipmapped arrays. The first six layers represent the first cubemap mipmapped array, the
         next six layers form the second cubemap mipmapped array, and so on.

       The flags parameter enables different options to be specified that affect the allocation,
       as follows.

       • cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped
         array allocation

       • cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent
         indicating the number of layers

       • cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to
         height, and depth must be six. If the cudaArrayLayered flag is also set, depth must be a
         multiple of six.

       • cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA
         mipmapped array will be read from or written to using a surface reference.

       • cudaArrayTextureGather: This flag indicates that texture gather operations will be
         performed on the CUDA array. Texture gather can only be performed on 2D CUDA mipmapped
         arrays, and the gather operations are performed only on the most detailed mipmap level.

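       The width, height and depth extents must meet certain size requirements, which depend on
       the type of CUDA mipmapped array and on the device limits reported in cudaDeviceProp. The
       table listing these limits is not reproduced in this manual page; see the
       cudaMallocMipmappedArray entry in the CUDA Runtime API reference. All values are
       specified in elements.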

       Parameters:
           mipmappedArray - Pointer to allocated mipmapped array in device memory
           desc - Requested channel format
           extent - Requested allocation size (width field in elements)
           numLevels - Number of mipmap levels to allocate
           flags - Flags for extensions

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc3D, cudaMalloc, cudaMallocPitch, cudaFree, cudaFreeArray, cudaMallocHost (C
           API), cudaFreeHost, cudaHostAlloc, make_cudaExtent, cuMipmappedArrayCreate

   cudaError_t cudaMallocPitch (void ** devPtr, size_t * pitch, size_t width, size_t height)
       Allocates at least width (in bytes) * height bytes of linear memory on the device and
       returns in *devPtr a pointer to the allocated memory. The function may pad the allocation
       to ensure that corresponding pointers in any given row will continue to meet the alignment
       requirements for coalescing as the address is updated from row to row. The pitch returned
       in *pitch by cudaMallocPitch() is the width in bytes of the allocation. The intended usage
       of pitch is as a separate parameter of the allocation, used to compute addresses within
       the 2D array. Given the row and column of an array element of type T, the address is
       computed as:

           T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

       For allocations of 2D arrays, it is recommended that programmers consider performing pitch
       allocations using cudaMallocPitch(). Due to pitch alignment restrictions in the hardware,
       this is especially true if the application will be performing 2D memory copies between
       different regions of device memory (whether linear memory or CUDA arrays).

       Parameters:
           devPtr - Pointer to allocated pitched device memory
           pitch - Pitch for allocation
           width - Requested pitched allocation width (in bytes)
           height - Requested pitched allocation height

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorMemoryAllocation

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMalloc, cudaFree, cudaMallocArray, cudaFreeArray, cudaMallocHost (C API),
           cudaFreeHost, cudaMalloc3D, cudaMalloc3DArray, cudaHostAlloc, cuMemAllocPitch

   cudaError_t cudaMemAdvise (const void * devPtr, size_t count, enum cudaMemoryAdvise advice,
       int device)
       Advise the Unified Memory subsystem about the usage pattern for the memory range starting
       at devPtr with a size of count bytes. The start address and end address of the memory
       range will be rounded down and rounded up respectively to be aligned to CPU page size
       before the advice is applied. The memory range must refer to managed memory allocated via
       cudaMallocManaged or declared via __managed__ variables.

       The advice parameter can take the following values:

       • cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read from
         and only occasionally written to. Any read accesses from any processor to this region
         will create a read-only copy of at least the accessed pages in that processor's memory.
         Additionally, if cudaMemPrefetchAsync is called on this region, it will create a read-
         only copy of the data on the destination processor. If any processor writes to this
         region, all copies of the corresponding page will be invalidated except for the one
         where the write occurred. The device argument is ignored for this advice. Note that for
         a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
         that has a non-zero value for the device attribute cudaDevAttrConcurrentManagedAccess.
         Also, if a context is created on a device that does not have the device attribute
         cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until all
         such contexts are destroyed.

       • cudaMemAdviseUnsetReadMostly: Undoes the effect of cudaMemAdviseSetReadMostly and also
         prevents the Unified Memory driver from attempting heuristic read-duplication on the
         memory range. Any read-duplicated copies of the data will be collapsed into a single
         copy. The location for the collapsed copy will be the preferred location if the page has
         a preferred location and one of the read-duplicated copies was resident at that
         location. Otherwise, the location chosen is arbitrary.

       • cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the data
         to be the memory belonging to device. Passing in cudaCpuDeviceId for device sets the
         preferred location as host memory. If device is a GPU, then it must have a non-zero
         value for the device attribute cudaDevAttrConcurrentManagedAccess. Setting the preferred
         location does not cause data to migrate to that location immediately. Instead, it guides
         the migration policy when a fault occurs on that memory region. If the data is already
         in its preferred location and the faulting processor can establish a mapping without
         requiring the data to be migrated, then data migration will be avoided. On the other
         hand, if the data is not in its preferred location or if a direct mapping cannot be
         established, then it will be migrated to the processor accessing it. It is important to
         note that setting the preferred location does not prevent data prefetching done using
         cudaMemPrefetchAsync. Having a preferred location can override the page thrash detection
         and resolution logic in the Unified Memory driver. Normally, if a page is detected to be
         constantly thrashing between, for example, host and device memory, the page may eventually
         be pinned to host memory by the Unified Memory driver. But if the preferred location is
         set as device memory, then the page will continue to thrash indefinitely. If
         cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then
         the policies associated with that advice will override the policies of this advice.

       • cudaMemAdviseUnsetPreferredLocation: Undoes the effect of
         cudaMemAdviseSetPreferredLocation and changes the preferred location to none.

       • cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by
         device. Passing in cudaCpuDeviceId for device will set the advice for the CPU. If device
         is a GPU, then the device attribute cudaDevAttrConcurrentManagedAccess must be non-zero.
         This advice does not cause data migration and has no impact on the location of the data
         per se. Instead, it causes the data to always be mapped in the specified processor's
         page tables, as long as the location of the data permits a mapping to be established. If
         the data gets migrated for any reason, the mappings are updated accordingly. This advice
         is recommended in scenarios where data locality is not important, but avoiding faults
         is. Consider for example a system containing multiple GPUs with peer-to-peer access
         enabled, where the data located on one GPU is occasionally accessed by peer GPUs. In
         such scenarios, migrating data over to the other GPUs is not as important because the
         accesses are infrequent and the overhead of migration may be too high. But preventing
         faults can still help improve performance, and so having a mapping set up in advance is
         useful. Note that on CPU access of this data, the data may be migrated to host memory
         because the CPU typically cannot access device memory directly. Any GPU that had the
         cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to
         point to the page in host memory. If cudaMemAdviseSetReadMostly is also set on this
         memory region or any subset of it, then the policies associated with that advice will
         override the policies of this advice. Additionally, if the preferred location of this
         memory region or any subset of it is also device, then the policies associated with
         cudaMemAdviseSetPreferredLocation will override the policies of this advice.

       • cudaMemAdviseUnsetAccessedBy: Undoes the effect of cudaMemAdviseSetAccessedBy. Any
         mappings to the data from device may be removed at any time causing accesses to result
         in non-fatal page faults.

       Parameters:
           devPtr - Pointer to memory to set the advice for
           count - Size in bytes of the memory range
           advice - Advice to be applied for the specified memory range
           device - Device to apply the advice for

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync,
           cudaMemPrefetchAsync, cuMemAdvise

   cudaError_t cudaMemcpy (void * dst, const void * src, size_t count, enum cudaMemcpyKind kind)
       Copies count bytes from the memory area pointed to by src to the memory area pointed to by
       dst, where kind specifies the direction of the copy, and must be one of
       cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.
       Calling cudaMemcpy() with dst and src pointers that do not match the direction of the copy
       results in undefined behavior.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           cuMemcpyDtoH, cuMemcpyHtoD, cuMemcpyDtoD, cuMemcpy

   cudaError_t cudaMemcpy2D (void * dst, size_t dpitch, const void * src, size_t spitch, size_t
       width, size_t height, enum cudaMemcpyKind kind)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to by src
       to the memory area pointed to by dst, where kind specifies the direction of the copy, and
       must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.
       dpitch and spitch are the widths in memory in bytes of the 2D arrays pointed to by dst and
       src, including any padding added to the end of each row. The memory areas may not overlap.
       width must not exceed either dpitch or spitch. Calling cudaMemcpy2D() with dst and src
       pointers that do not match the direction of the copy results in undefined behavior.
       cudaMemcpy2D() returns an error if dpitch or spitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMemcpy, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           cuMemcpy2D, cuMemcpy2DUnaligned

   cudaError_t cudaMemcpy2DArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
       cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height,
       enum cudaMemcpyKind kind = cudaMemcpyDeviceToDevice)
       Copies a matrix (height rows of width bytes each) from the CUDA array src starting at
       the upper left corner (wOffsetSrc, hOffsetSrc) to the CUDA array dst starting at the upper
       left corner (wOffsetDst, hOffsetDst), where kind specifies the direction of the copy, and
       must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.
       wOffsetDst + width must not exceed the width of the CUDA array dst. wOffsetSrc + width
       must not exceed the width of the CUDA array src.

       Parameters:
           dst - Destination memory address
           wOffsetDst - Destination starting X offset
           hOffsetDst - Destination starting Y offset
           src - Source memory address
           wOffsetSrc - Source starting X offset
           hOffsetSrc - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned

   __cudart_builtin__ cudaError_t cudaMemcpy2DAsync (void * dst, size_t dpitch, const void * src,
       size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream
       = 0)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to by src
       to the memory area pointed to by dst, where kind specifies the direction of the copy, and
       must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.
       dpitch and spitch are the widths in memory in bytes of the 2D arrays pointed to by dst and
       src, including any padding added to the end of each row. The memory areas may not overlap.
       width must not exceed either dpitch or spitch.

       Calling cudaMemcpy2DAsync() with dst and src pointers that do not match the direction of
       the copy results in undefined behavior. cudaMemcpy2DAsync() returns an error if dpitch
       or spitch is greater than the maximum allowed.

       cudaMemcpy2DAsync() is asynchronous with respect to the host, so the call may return
       before the copy is complete. The copy can optionally be associated to a stream by passing
       a non-zero stream argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost
       and stream is non-zero, the copy may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpy2DFromArray (void * dst, size_t dpitch, cudaArray_const_t src, size_t
       wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind)
       Copies a matrix (height rows of width bytes each) from the CUDA array src starting at
       the upper left corner (wOffset, hOffset) to the memory area pointed to by dst, where kind
       specifies the direction of the copy, and must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. dpitch is the width in memory in bytes
       of the 2D array pointed to by dst, including any padding added to the end of each row.
       wOffset + width must not exceed the width of the CUDA array src. width must not exceed
       dpitch. cudaMemcpy2DFromArray() returns an error if dpitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray, cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy2D, cuMemcpy2DUnaligned

   cudaError_t cudaMemcpy2DFromArrayAsync (void * dst, size_t dpitch, cudaArray_const_t src,
       size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind,
       cudaStream_t stream = 0)
       Copies a matrix (height rows of width bytes each) from the CUDA array src starting at
       the upper left corner (wOffset, hOffset) to the memory area pointed to by dst, where kind
       specifies the direction of the copy, and must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. dpitch is the width in memory in bytes
       of the 2D array pointed to by dst, including any padding added to the end of each row.
       wOffset + width must not exceed the width of the CUDA array src. width must not exceed
       dpitch. cudaMemcpy2DFromArrayAsync() returns an error if dpitch exceeds the maximum
       allowed.

       cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so the call may
       return before the copy is complete. The copy can optionally be associated to a stream by
       passing a non-zero stream argument. If kind is cudaMemcpyHostToDevice or
       cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap with operations in
       other streams.

       Parameters:
           dst - Destination memory address
           dpitch - Pitch of destination memory
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpy2DToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *
       src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to by src
       to the CUDA array dst starting at the upper left corner (wOffset, hOffset) where kind
       specifies the direction of the copy, and must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. spitch is the width in memory in bytes
       of the 2D array pointed to by src, including any padding added to the end of each row.
       wOffset + width must not exceed the width of the CUDA array dst. width must not exceed
       spitch. cudaMemcpy2DToArray() returns an error if spitch exceeds the maximum allowed.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           cuMemcpy2D, cuMemcpy2DUnaligned

   cudaError_t cudaMemcpy2DToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const
       void * src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind,
       cudaStream_t stream = 0)
       Copies a matrix (height rows of width bytes each) from the memory area pointed to by src
       to the CUDA array dst starting at the upper left corner (wOffset, hOffset) where kind
       specifies the direction of the copy, and must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. spitch is the width in memory in bytes
       of the 2D array pointed to by src, including any padding added to the end of each row.
       wOffset + width must not exceed the width of the CUDA array dst. width must not exceed
       spitch. cudaMemcpy2DToArrayAsync() returns an error if spitch exceeds the maximum allowed.

       cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so the call may
       return before the copy is complete. The copy can optionally be associated to a stream by
       passing a non-zero stream argument. If kind is cudaMemcpyHostToDevice or
       cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap with operations in
       other streams.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           spitch - Pitch of source memory
           width - Width of matrix transfer (columns in bytes)
           height - Height of matrix transfer (rows)
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpy3D (const struct cudaMemcpy3DParms * p)
       struct cudaExtent {
         size_t width;
         size_t height;
         size_t depth;
       };
       struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

       struct cudaPos {
         size_t x;
         size_t y;
         size_t z;
       };
       struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

       struct cudaMemcpy3DParms {
         cudaArray_t           srcArray;
         struct cudaPos        srcPos;
         struct cudaPitchedPtr srcPtr;
         cudaArray_t           dstArray;
         struct cudaPos        dstPos;
         struct cudaPitchedPtr dstPtr;
         struct cudaExtent     extent;
         enum cudaMemcpyKind   kind;
       };

       cudaMemcpy3D() copies data between two 3D objects. The source and destination objects may
       be in either host memory, device memory, or a CUDA array. The source, destination, extent,
       and kind of copy performed are specified by the cudaMemcpy3DParms struct, which should be
       initialized to zero before use:

       cudaMemcpy3DParms myParms = {0};

       The struct passed to cudaMemcpy3D() must specify one of srcArray or srcPtr and one of
       dstArray or dstPtr. Passing more than one non-zero source or destination will cause
       cudaMemcpy3D() to return an error.

       The srcPos and dstPos fields are optional offsets into the source and destination objects
       and are defined in units of each object's elements. The element for a host or device
       pointer is assumed to be unsigned char.

       The extent field defines the dimensions of the transferred area in elements. If a CUDA
       array is participating in the copy, the extent is defined in terms of that array's
       elements. If no CUDA array is participating in the copy then the extents are defined in
       elements of unsigned char.

       The kind field defines the direction of the copy. It must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. For cudaMemcpyHostToHost or
       cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost passed as kind and cudaArray type passed
       as source or destination, if the kind implies cudaArray type to be present on the host,
       cudaMemcpy3D() will disregard that implication and silently correct the kind based on the
       fact that cudaArray type can only be present on the device.

       If the source and destination are both arrays, cudaMemcpy3D() will return an error if they
       do not have the same element size.

       The source and destination object may not overlap. If overlapping source and destination
       objects are specified, undefined behavior will result.

       The source object must lie entirely within the region defined by srcPos and extent. The
       destination object must lie entirely within the region defined by dstPos and extent.

       cudaMemcpy3D() returns an error if the pitch of srcPtr or dstPtr exceeds the maximum
       allowed. The pitch of a cudaPitchedPtr allocated with cudaMalloc3D() will always be valid.

       Parameters:
           p - 3D memory copy parameters

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMalloc3D, cudaMalloc3DArray, cudaMemset3D, cudaMemcpy3DAsync, cudaMemcpy,
           cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           make_cudaExtent, make_cudaPos, cuMemcpy3D

   __cudart_builtin__ cudaError_t cudaMemcpy3DAsync (const struct cudaMemcpy3DParms * p,
       cudaStream_t stream = 0)
       struct cudaExtent {
         size_t width;
         size_t height;
         size_t depth;
       };
       struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

       struct cudaPos {
         size_t x;
         size_t y;
         size_t z;
       };
       struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

       struct cudaMemcpy3DParms {
         cudaArray_t           srcArray;
         struct cudaPos        srcPos;
         struct cudaPitchedPtr srcPtr;
         cudaArray_t           dstArray;
         struct cudaPos        dstPos;
         struct cudaPitchedPtr dstPtr;
         struct cudaExtent     extent;
         enum cudaMemcpyKind   kind;
       };

       cudaMemcpy3DAsync() copies data between two 3D objects. The source and destination objects
       may be in either host memory, device memory, or a CUDA array. The source, destination,
       extent, and kind of copy performed are specified by the cudaMemcpy3DParms struct, which
       should be initialized to zero before use:

       cudaMemcpy3DParms myParms = {0};

       The struct passed to cudaMemcpy3DAsync() must specify one of srcArray or srcPtr and one of
       dstArray or dstPtr. Passing more than one non-zero source or destination will cause
       cudaMemcpy3DAsync() to return an error.

       The srcPos and dstPos fields are optional offsets into the source and destination objects
       and are defined in units of each object's elements. The element for a host or device
       pointer is assumed to be unsigned char. For CUDA arrays, positions must be in the range
       [0, 2048) for any dimension.

       The extent field defines the dimensions of the transferred area in elements. If a CUDA
       array is participating in the copy, the extent is defined in terms of that array's
       elements. If no CUDA array is participating in the copy then the extents are defined in
       elements of unsigned char.

       The kind field defines the direction of the copy. It must be one of cudaMemcpyHostToHost,
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or
       cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended, in which case the type of
       transfer is inferred from the pointer values. However, cudaMemcpyDefault is only allowed
       on systems that support unified virtual addressing. For cudaMemcpyHostToHost or
       cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost passed as kind and cudaArray type passed
       as source or destination, if the kind implies cudaArray type to be present on the host,
       cudaMemcpy3DAsync() will disregard that implication and silently correct the kind based on
       the fact that cudaArray type can only be present on the device.

       If the source and destination are both arrays, cudaMemcpy3DAsync() will return an error if
       they do not have the same element size.

       The source and destination object may not overlap. If overlapping source and destination
       objects are specified, undefined behavior will result.

       The source object must lie entirely within the region defined by srcPos and extent. The
       destination object must lie entirely within the region defined by dstPos and extent.

       cudaMemcpy3DAsync() returns an error if the pitch of srcPtr or dstPtr exceeds the maximum
       allowed. The pitch of a cudaPitchedPtr allocated with cudaMalloc3D() will always be valid.

       cudaMemcpy3DAsync() is asynchronous with respect to the host, so the call may return
       before the copy is complete. The copy can optionally be associated to a stream by passing
       a non-zero stream argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost
       and stream is non-zero, the copy may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           p - 3D memory copy parameters
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidPitchValue,
           cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMalloc3D, cudaMalloc3DArray, cudaMemset3D, cudaMemcpy3D, cudaMemcpy, cudaMemcpy2D,
           cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray, cudaMemcpy2DFromArray,
           cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray, cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, make_cudaExtent, make_cudaPos,
           cuMemcpy3DAsync

   cudaError_t cudaMemcpy3DPeer (const struct cudaMemcpy3DPeerParms * p)
       Perform a 3D memory copy according to the parameters specified in p. See the definition of
       the cudaMemcpy3DPeerParms structure for documentation of its parameters.

       Note that this function is synchronous with respect to the host only if the source or
       destination of the transfer is host memory. Note also that this copy is serialized with
       respect to all pending and future asynchronous work on the current device, the copy's
       source device, and the copy's destination device (use cudaMemcpy3DPeerAsync to avoid this
       synchronization).

       Parameters:
           p - Parameters for the memory copy

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpyPeerAsync,
           cudaMemcpy3DPeerAsync, cuMemcpy3DPeer

   cudaError_t cudaMemcpy3DPeerAsync (const struct cudaMemcpy3DPeerParms * p, cudaStream_t stream
       = 0)
       Perform a 3D memory copy according to the parameters specified in p. See the definition of
       the cudaMemcpy3DPeerParms structure for documentation of its parameters.

       Parameters:
           p - Parameters for the memory copy
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpyPeerAsync,
           cudaMemcpy3DPeer, cuMemcpy3DPeerAsync

   cudaError_t cudaMemcpyArrayToArray (cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst,
       cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum
       cudaMemcpyKind kind = cudaMemcpyDeviceToDevice)
       Copies count bytes from the CUDA array src starting at the upper left corner (wOffsetSrc,
       hOffsetSrc) to the CUDA array dst starting at the upper left corner (wOffsetDst,
       hOffsetDst) where kind specifies the direction of the copy, and must be one of
       cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.

       Parameters:
           dst - Destination memory address
           wOffsetDst - Destination starting X offset
           hOffsetDst - Destination starting Y offset
           src - Source memory address
           wOffsetSrc - Source starting X offset
           hOffsetSrc - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpy2DArrayToArray, cudaMemcpyToSymbol,
           cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpyAtoA

   __cudart_builtin__ cudaError_t cudaMemcpyAsync (void * dst, const void * src, size_t count,
       enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the memory area pointed to by src to the memory area pointed to by
       dst, where kind specifies the direction of the copy, and must be one of
       cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
       cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing cudaMemcpyDefault is recommended,
       in which case the type of transfer is inferred from the pointer values. However,
       cudaMemcpyDefault is only allowed on systems that support unified virtual addressing.

       The memory areas may not overlap. Calling cudaMemcpyAsync() with dst and src pointers that
       do not match the direction of the copy results in undefined behavior.

       cudaMemcpyAsync() is asynchronous with respect to the host, so the call may return before
       the copy is complete. The copy can optionally be associated to a stream by passing a non-
       zero stream argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost and the
       stream is non-zero, the copy may overlap with operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpyAsync, cuMemcpyDtoHAsync,
           cuMemcpyHtoDAsync, cuMemcpyDtoDAsync

   cudaError_t cudaMemcpyFromArray (void * dst, cudaArray_const_t src, size_t wOffset, size_t
       hOffset, size_t count, enum cudaMemcpyKind kind)
       Copies count bytes from the CUDA array src starting at the upper left corner (wOffset,
       hOffset) to the memory area pointed to by dst, where kind specifies the direction of the
       copy, and must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           cuMemcpyAtoH, cuMemcpyAtoD

   cudaError_t cudaMemcpyFromArrayAsync (void * dst, cudaArray_const_t src, size_t wOffset,
       size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the CUDA array src starting at the upper left corner (wOffset,
       hOffset) to the memory area pointed to by dst, where kind specifies the direction of the
       copy, and must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so the call may
       return before the copy is complete. The copy can optionally be associated to a stream by
       passing a non-zero stream argument. If kind is cudaMemcpyHostToDevice or
       cudaMemcpyDeviceToHost and stream is non-zero, the copy may overlap with operations in
       other streams.

       Parameters:
           dst - Destination memory address
           src - Source memory address
           wOffset - Source starting X offset
           hOffset - Source starting Y offset
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpyAtoHAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpyFromSymbol (void * dst, const void * symbol, size_t count, size_t offset
       = 0, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost)
       Copies count bytes from the memory area pointed to by offset bytes from the start of
       symbol symbol to the memory area pointed to by dst. The memory areas may not overlap.
       symbol is a variable that resides in global or constant memory space. kind can be either
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       Parameters:
           dst - Destination memory address
           symbol - Device symbol address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol,
           cudaErrorInvalidMemcpyDirection, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy, cuMemcpyDtoH,
           cuMemcpyDtoD

   cudaError_t cudaMemcpyFromSymbolAsync (void * dst, const void * symbol, size_t count, size_t
       offset, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the memory area pointed to by offset bytes from the start of
       symbol symbol to the memory area pointed to by dst. The memory areas may not overlap.
       symbol is a variable that resides in global or constant memory space. kind can be either
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so the call may
       return before the copy is complete. The copy can optionally be associated to a stream by
       passing a non-zero stream argument. If kind is cudaMemcpyDeviceToHost and stream is non-
       zero, the copy may overlap with operations in other streams.

       Parameters:
           dst - Destination memory address
           symbol - Device symbol address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol,
           cudaErrorInvalidMemcpyDirection, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cuMemcpyAsync, cuMemcpyDtoHAsync,
           cuMemcpyDtoDAsync

   cudaError_t cudaMemcpyPeer (void * dst, int dstDevice, const void * src, int srcDevice, size_t
       count)
       Copies memory from one device to memory on another device. dst is the base device pointer
       of the destination memory and dstDevice is the destination device. src is the base device
       pointer of the source memory and srcDevice is the source device. count specifies the
       number of bytes to copy.

       Note that this function is asynchronous with respect to the host, but serialized with
       respect to all pending and future asynchronous work in the current device, srcDevice, and
       dstDevice (use cudaMemcpyPeerAsync to avoid this synchronization).

       Parameters:
           dst - Destination device pointer
           dstDevice - Destination device
           src - Source device pointer
           srcDevice - Source device
           count - Size of memory copy in bytes

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpyAsync, cudaMemcpyPeerAsync, cudaMemcpy3DPeerAsync, cuMemcpyPeer

   cudaError_t cudaMemcpyPeerAsync (void * dst, int dstDevice, const void * src, int srcDevice,
       size_t count, cudaStream_t stream = 0)
       Copies memory from one device to memory on another device. dst is the base device pointer
       of the destination memory and dstDevice is the destination device. src is the base device
       pointer of the source memory and srcDevice is the source device. count specifies the
       number of bytes to copy.

       Note that this function is asynchronous with respect to the host and all work on other
       devices.

       Parameters:
           dst - Destination device pointer
           dstDevice - Destination device
           src - Source device pointer
           srcDevice - Source device
           count - Size of memory copy in bytes
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync, cuMemcpyPeerAsync

   cudaError_t cudaMemcpyToArray (cudaArray_t dst, size_t wOffset, size_t hOffset, const void *
       src, size_t count, enum cudaMemcpyKind kind)
       Copies count bytes from the memory area pointed to by src to the CUDA array dst starting
       at the upper left corner (wOffset, hOffset), where kind specifies the direction of the
       copy, and must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync,
           cuMemcpyHtoA, cuMemcpyDtoA

   cudaError_t cudaMemcpyToArrayAsync (cudaArray_t dst, size_t wOffset, size_t hOffset, const
       void * src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the memory area pointed to by src to the CUDA array dst starting
       at the upper left corner (wOffset, hOffset), where kind specifies the direction of the
       copy, and must be one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
       cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so the call may return
       before the copy is complete. The copy can optionally be associated to a stream by passing
       a non-zero stream argument. If kind is cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost
       and stream is non-zero, the copy may overlap with operations in other streams.

       Parameters:
           dst - Destination memory address
           wOffset - Destination starting X offset
           hOffset - Destination starting Y offset
           src - Source memory address
           count - Size in bytes to copy
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidMemcpyDirection

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpyHtoAAsync, cuMemcpy2DAsync

   cudaError_t cudaMemcpyToSymbol (const void * symbol, const void * src, size_t count, size_t
       offset = 0, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice)
       Copies count bytes from the memory area pointed to by src to the memory area pointed to by
       offset bytes from the start of symbol symbol. The memory areas may not overlap. symbol is
       a variable that resides in global or constant memory space. kind can be either
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       Parameters:
           symbol - Device symbol address
           src - Source memory address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol,
           cudaErrorInvalidMemcpyDirection, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits synchronous behavior for most use cases.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemcpyToArrayAsync,
           cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync, cudaMemcpy2DFromArrayAsync,
           cudaMemcpyToSymbolAsync, cudaMemcpyFromSymbolAsync, cuMemcpy, cuMemcpyHtoD,
           cuMemcpyDtoD

   cudaError_t cudaMemcpyToSymbolAsync (const void * symbol, const void * src, size_t count,
       size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream = 0)
       Copies count bytes from the memory area pointed to by src to the memory area pointed to by
       offset bytes from the start of symbol symbol. The memory areas may not overlap. symbol is
       a variable that resides in global or constant memory space. kind can be either
       cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice, or cudaMemcpyDefault. Passing
       cudaMemcpyDefault is recommended, in which case the type of transfer is inferred from the
       pointer values. However, cudaMemcpyDefault is only allowed on systems that support unified
       virtual addressing.

       cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so the call may return
       before the copy is complete. The copy can optionally be associated to a stream by passing
       a non-zero stream argument. If kind is cudaMemcpyHostToDevice and stream is non-zero, the
       copy may overlap with operations in other streams.

       Parameters:
           symbol - Device symbol address
           src - Source memory address
           count - Size in bytes to copy
           offset - Offset from start of symbol in bytes
           kind - Type of transfer
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidSymbol,
           cudaErrorInvalidMemcpyDirection, cudaErrorNoKernelImageForDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

           Use of a string naming a variable as the symbol parameter was deprecated in CUDA 4.1
           and removed in CUDA 5.0.

       See also:
           cudaMemcpy, cudaMemcpy2D, cudaMemcpyToArray, cudaMemcpy2DToArray, cudaMemcpyFromArray,
           cudaMemcpy2DFromArray, cudaMemcpyArrayToArray, cudaMemcpy2DArrayToArray,
           cudaMemcpyToSymbol, cudaMemcpyFromSymbol, cudaMemcpyAsync, cudaMemcpy2DAsync,
           cudaMemcpyToArrayAsync, cudaMemcpy2DToArrayAsync, cudaMemcpyFromArrayAsync,
           cudaMemcpy2DFromArrayAsync, cudaMemcpyFromSymbolAsync, cuMemcpyAsync,
           cuMemcpyHtoDAsync, cuMemcpyDtoDAsync

   cudaError_t cudaMemGetInfo (size_t * free, size_t * total)
       Returns in *free and *total respectively, the free and total amount of memory available
       for allocation by the device in bytes.

       Parameters:
           free - Returned free memory in bytes
           total - Returned total memory in bytes

       Returns:
           cudaSuccess, cudaErrorInitializationError, cudaErrorInvalidValue,
           cudaErrorLaunchFailure

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cuMemGetInfo

   cudaError_t cudaMemPrefetchAsync (const void * devPtr, size_t count, int dstDevice,
       cudaStream_t stream = 0)
       Prefetches memory to the specified destination device. devPtr is the base device pointer
       of the memory to be prefetched and dstDevice is the destination device. count specifies
       the number of bytes to copy. stream is the stream in which the operation is enqueued. The
       memory range must refer to managed memory allocated via cudaMallocManaged or declared via
       __managed__ variables.

       Passing in cudaCpuDeviceId for dstDevice will prefetch the data to host memory. If
       dstDevice is a GPU, then the device attribute cudaDevAttrConcurrentManagedAccess must be
       non-zero. Additionally, stream must be associated with a device that has a non-zero value
       for the device attribute cudaDevAttrConcurrentManagedAccess.

       The start address and end address of the memory range will be rounded down and rounded up
       respectively to be aligned to CPU page size before the prefetch operation is enqueued in
       the stream.

       If no physical memory has been allocated for this region, then this memory region will be
       populated and mapped on the destination device. If there's insufficient memory to prefetch
       the desired region, the Unified Memory driver may evict pages from other cudaMallocManaged
       allocations to host memory in order to make room. Device memory allocated using cudaMalloc
       or cudaMallocArray will not be evicted.

       By default, any mappings to the previous location of the migrated pages are removed and
       mappings for the new location are only set up on dstDevice. The exact behavior however also
       depends on the settings applied to this memory range via cudaMemAdvise as described below:

       If cudaMemAdviseSetReadMostly was set on any subset of this memory range, then that subset
       will create a read-only copy of the pages on dstDevice.

       If cudaMemAdviseSetPreferredLocation was called on any subset of this memory range, then
       the pages will be migrated to dstDevice even if dstDevice is not the preferred location of
       any pages in the memory range.

       If cudaMemAdviseSetAccessedBy was called on any subset of this memory range, then mappings
       to those pages from all the appropriate processors are updated to refer to the new
       location if establishing such a mapping is possible. Otherwise, those mappings are
       cleared.

       Note that this API is not required for functionality and only serves to improve
       performance by allowing the application to migrate data to a suitable location before it
       is accessed. Memory accesses to this range are always coherent and are allowed even when
       the data is actively being migrated.

       Note that this function is asynchronous with respect to the host and all work on other
       devices.

       Parameters:
           devPtr - Pointer to be prefetched
           count - Size in bytes
           dstDevice - Destination device to prefetch to
           stream - Stream to enqueue prefetch operation

       Returns:
           cudaSuccess, cudaErrorInvalidValue, cudaErrorInvalidDevice

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemcpy, cudaMemcpyPeer, cudaMemcpyAsync, cudaMemcpy3DPeerAsync, cudaMemAdvise,
           cuMemPrefetchAsync

   cudaError_t cudaMemRangeGetAttribute (void * data, size_t dataSize, enum cudaMemRangeAttribute
       attribute, const void * devPtr, size_t count)
       Query an attribute about the memory range starting at devPtr with a size of count bytes.
       The memory range must refer to managed memory allocated via cudaMallocManaged or declared
       via __managed__ variables.

       The attribute parameter can take the following values:

       • cudaMemRangeAttributeReadMostly: If this attribute is specified, data will be
         interpreted as a 32-bit integer, and dataSize must be 4. The result returned will be 1
         if all pages in the given memory range have read-duplication enabled, or 0 otherwise.

       • cudaMemRangeAttributePreferredLocation: If this attribute is specified, data will be
         interpreted as a 32-bit integer, and dataSize must be 4. The result returned will be a
         GPU device id if all pages in the memory range have that GPU as their preferred
         location, or it will be cudaCpuDeviceId if all pages in the memory range have the CPU as
         their preferred location, or it will be cudaInvalidDeviceId if either all the pages
         don't have the same preferred location or some of the pages don't have a preferred
         location at all. Note that the actual location of the pages in the memory range at the
         time of the query may be different from the preferred location.

       • cudaMemRangeAttributeAccessedBy: If this attribute is specified, data will be
         interpreted as an array of 32-bit integers, and dataSize must be a non-zero multiple of
         4. The result returned will be a list of device ids that had cudaMemAdviseSetAccessedBy
         set for that entire memory range. If any device does not have that advice set for the
         entire memory range, that device will not be included. If data is larger than the number
         of devices that have that advice set for that memory range, cudaInvalidDeviceId will be
         returned in all the extra space provided. For ex., if dataSize is 12 (i.e. data has 3
         elements) and only device 0 has the advice set, then the result returned will be { 0,
         cudaInvalidDeviceId, cudaInvalidDeviceId }. If data is smaller than the number of
         devices that have that advice set, then only as many devices will be returned as can fit
         in the array. There is no guarantee on which specific devices will be returned, however.

       • cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, data will be
         interpreted as a 32-bit integer, and dataSize must be 4. The result returned will be the
         last location to which all pages in the memory range were prefetched explicitly via
         cudaMemPrefetchAsync. This will either be a GPU id or cudaCpuDeviceId depending on
         whether the last location for prefetch was a GPU or the CPU respectively. If any page in
         the memory range was never explicitly prefetched or if all pages were not prefetched to
         the same location, cudaInvalidDeviceId will be returned. Note that this simply returns
         the last location that the application requested to prefetch the memory range to. It
         gives no indication as to whether the prefetch operation to that location has completed
         or even begun.

       Parameters:
           data - A pointer to a memory location where the result of the attribute query will be
           written.
           dataSize - Size in bytes of the memory pointed to by data
           attribute - The attribute to query
           devPtr - Start of the range to query
           count - Size of the range to query

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           This function exhibits asynchronous behavior for most use cases.

           This function uses standard default stream semantics.

       See also:
           cudaMemRangeGetAttributes, cudaMemPrefetchAsync, cudaMemAdvise, cuMemRangeGetAttribute

   cudaError_t cudaMemRangeGetAttributes (void ** data, size_t * dataSizes, enum
       cudaMemRangeAttribute * attributes, size_t numAttributes, const void * devPtr, size_t
       count)
       Query attributes of the memory range starting at devPtr with a size of count bytes. The
       memory range must refer to managed memory allocated via cudaMallocManaged or declared via
       __managed__ variables. The attributes array will be interpreted to have numAttributes
       entries. The dataSizes array will also be interpreted to have numAttributes entries. The
       results of the query will be stored in data.

       The list of supported attributes are given below. Please refer to cudaMemRangeGetAttribute
       for attribute descriptions and restrictions.

       • cudaMemRangeAttributeReadMostly

       • cudaMemRangeAttributePreferredLocation

       • cudaMemRangeAttributeAccessedBy

       • cudaMemRangeAttributeLastPrefetchLocation

       Parameters:
           data - A two-dimensional array containing pointers to memory locations where the
           result of each attribute query will be written to.
           dataSizes - Array containing the sizes of each result
           attributes - An array of attributes to query (numAttributes and the number of
           attributes in this array should match)
           numAttributes - Number of attributes to query
           devPtr - Start of the range to query
           count - Size of the range to query

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

       See also:
           cudaMemRangeGetAttribute, cudaMemAdvise, cudaMemPrefetchAsync, cuMemRangeGetAttributes

   cudaError_t cudaMemset (void * devPtr, int value, size_t count)
       Fills the first count bytes of the memory area pointed to by devPtr with the constant byte
       value value.

       Note that this function is asynchronous with respect to the host unless devPtr refers to
       pinned host memory.

       Parameters:
           devPtr - Pointer to device memory
           value - Value to set for each byte of specified memory
           count - Size in bytes to set

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

       See also:
           cuMemsetD8, cuMemsetD16, cuMemsetD32

   cudaError_t cudaMemset2D (void * devPtr, size_t pitch, int value, size_t width, size_t height)
       Sets to the specified value value a matrix (height rows of width bytes each) pointed to by
       devPtr. pitch is the width in bytes of the 2D array pointed to by devPtr, including any
       padding added to the end of each row. This function performs fastest when the pitch is one
       that has been passed back by cudaMallocPitch().

       Note that this function is asynchronous with respect to the host unless devPtr refers to
       pinned host memory.

       Parameters:
           devPtr - Pointer to 2D device memory
           pitch - Pitch in bytes of 2D device memory
           value - Value to set for each byte of specified memory
           width - Width of matrix set (columns in bytes)
           height - Height of matrix set (rows)

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

       See also:
           cudaMemset, cudaMemset3D, cudaMemsetAsync, cudaMemset2DAsync, cudaMemset3DAsync,
           cuMemsetD2D8, cuMemsetD2D16, cuMemsetD2D32

   __cudart_builtin__ cudaError_t cudaMemset2DAsync (void * devPtr, size_t pitch, int value,
       size_t width, size_t height, cudaStream_t stream = 0)
       Sets to the specified value value a matrix (height rows of width bytes each) pointed to by
       devPtr. pitch is the width in bytes of the 2D array pointed to by devPtr, including any
       padding added to the end of each row. This function performs fastest when the pitch is one
       that has been passed back by cudaMallocPitch().

       cudaMemset2DAsync() is asynchronous with respect to the host, so the call may return
       before the memset is complete. The operation can optionally be associated to a stream by
       passing a non-zero stream argument. If stream is non-zero, the operation may overlap with
       operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           devPtr - Pointer to 2D device memory
           pitch - Pitch in bytes of 2D device memory
           value - Value to set for each byte of specified memory
           width - Width of matrix set (columns in bytes)
           height - Height of matrix set (rows)
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

           This function uses standard default stream semantics.

       See also:
           cudaMemset, cudaMemset2D, cudaMemset3D, cudaMemsetAsync, cudaMemset3DAsync,
           cuMemsetD2D8Async, cuMemsetD2D16Async, cuMemsetD2D32Async

   cudaError_t cudaMemset3D (struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent
       extent)
       Initializes each element of a 3D array to the specified value value. The object to
       initialize is defined by pitchedDevPtr. The pitch field of pitchedDevPtr is the width in
       memory in bytes of the 3D array pointed to by pitchedDevPtr, including any padding added
       to the end of each row. The xsize field specifies the logical width of each row in bytes,
       while the ysize field specifies the height of each 2D slice in rows.

       The extents of the initialized region are specified as a width in bytes, a height in rows,
       and a depth in slices.

       Extents with width greater than or equal to the xsize of pitchedDevPtr may perform
       significantly faster than extents narrower than the xsize. Secondarily, extents with
       height equal to the ysize of pitchedDevPtr will perform faster than when the height is
       shorter than the ysize.

       This function performs fastest when the pitchedDevPtr has been allocated by
       cudaMalloc3D().

       Note that this function is asynchronous with respect to the host unless pitchedDevPtr
       refers to pinned host memory.

       Parameters:
           pitchedDevPtr - Pointer to pitched device memory
           value - Value to set for each byte of specified memory
           extent - Size parameters for where to set device memory (width field in bytes)

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

       See also:
           cudaMemset, cudaMemset2D, cudaMemsetAsync, cudaMemset2DAsync, cudaMemset3DAsync,
           cudaMalloc3D, make_cudaPitchedPtr, make_cudaExtent

   __cudart_builtin__ cudaError_t cudaMemset3DAsync (struct cudaPitchedPtr pitchedDevPtr, int
       value, struct cudaExtent extent, cudaStream_t stream = 0)
       Initializes each element of a 3D array to the specified value value. The object to
       initialize is defined by pitchedDevPtr. The pitch field of pitchedDevPtr is the width in
       memory in bytes of the 3D array pointed to by pitchedDevPtr, including any padding added
       to the end of each row. The xsize field specifies the logical width of each row in bytes,
       while the ysize field specifies the height of each 2D slice in rows.

       The extents of the initialized region are specified as a width in bytes, a height in rows,
       and a depth in slices.

       Extents with width greater than or equal to the xsize of pitchedDevPtr may perform
       significantly faster than extents narrower than the xsize. Secondarily, extents with
       height equal to the ysize of pitchedDevPtr will perform faster than when the height is
       shorter than the ysize.

       This function performs fastest when the pitchedDevPtr has been allocated by
       cudaMalloc3D().

       cudaMemset3DAsync() is asynchronous with respect to the host, so the call may return
       before the memset is complete. The operation can optionally be associated to a stream by
       passing a non-zero stream argument. If stream is non-zero, the operation may overlap with
       operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           pitchedDevPtr - Pointer to pitched device memory
           value - Value to set for each byte of specified memory
           extent - Size parameters for where to set device memory (width field in bytes)
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

           This function uses standard default stream semantics.

       See also:
           cudaMemset, cudaMemset2D, cudaMemset3D, cudaMemsetAsync, cudaMemset2DAsync,
           cudaMalloc3D, make_cudaPitchedPtr, make_cudaExtent

   __cudart_builtin__ cudaError_t cudaMemsetAsync (void * devPtr, int value, size_t count,
       cudaStream_t stream = 0)
       Fills the first count bytes of the memory area pointed to by devPtr with the constant byte
       value value.

       cudaMemsetAsync() is asynchronous with respect to the host, so the call may return before
       the memset is complete. The operation can optionally be associated to a stream by passing
       a non-zero stream argument. If stream is non-zero, the operation may overlap with
       operations in other streams.

       The device version of this function only handles device to device copies and cannot be
       given local or shared pointers.

       Parameters:
           devPtr - Pointer to device memory
           value - Value to set for each byte of specified memory
           count - Size in bytes to set
           stream - Stream identifier

       Returns:
           cudaSuccess, cudaErrorInvalidValue

       Note:
           Note that this function may also return error codes from previous, asynchronous
           launches.

           See also memset synchronization details.

           This function uses standard default stream semantics.

       See also:
           cudaMemset, cudaMemset2D, cudaMemset3D, cudaMemset2DAsync, cudaMemset3DAsync,
           cuMemsetD8Async, cuMemsetD16Async, cuMemsetD32Async

   struct cudaExtent make_cudaExtent (size_t w, size_t h, size_t d) [read]
       Returns a cudaExtent based on the specified input parameters w, h, and d.

       Parameters:
           w - Width in elements when referring to array memory, in bytes when referring to
           linear memory
           h - Height in elements
           d - Depth in elements

       Returns:
           cudaExtent specified by w, h, and d

       See also:
           make_cudaPitchedPtr, make_cudaPos

   struct cudaPitchedPtr make_cudaPitchedPtr (void * d, size_t p, size_t xsz, size_t ysz) [read]
       Returns a cudaPitchedPtr based on the specified input parameters d, p, xsz, and ysz.

       Parameters:
           d - Pointer to allocated memory
           p - Pitch of allocated memory in bytes
           xsz - Logical width of allocation in elements
           ysz - Logical height of allocation in elements

       Returns:
           cudaPitchedPtr specified by d, p, xsz, and ysz

       See also:
           make_cudaExtent, make_cudaPos

   struct cudaPos make_cudaPos (size_t x, size_t y, size_t z) [read]
       Returns a cudaPos based on the specified input parameters x, y, and z.

       Parameters:
           x - X position
           y - Y position
           z - Z position

       Returns:
           cudaPos specified by x, y, and z

       See also:
           make_cudaExtent, make_cudaPitchedPtr

Author

       Generated automatically by Doxygen from the source code.