3#include "cuda_device.hpp"
23 cudaMemGetInfo(&free, &total),
"failed to get mem info on device ", d
35 cudaMemGetInfo(&free, &total),
"failed to get mem info on device ", d
52 cudaMalloc(&ptr, N*
sizeof(T)),
53 "failed to allocate memory (", N*
sizeof(T),
"bytes) on device ", d
68 cudaMalloc(&ptr, N*
sizeof(T)),
69 "failed to allocate memory (", N*
sizeof(T),
"bytes)"
84 cudaMallocManaged(&ptr, N*
sizeof(T)),
85 "failed to allocate shared memory (", N*
sizeof(T),
"bytes)"
103 TF_CHECK_CUDA(cudaFree(ptr),
"failed to free memory ", ptr,
" on GPU ", d);
117 TF_CHECK_CUDA(cudaFree(ptr),
"failed to free memory ", ptr);
133 cudaStream_t stream,
void* dst,
const void* src,
size_t count
136 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
137 "failed to perform cudaMemcpyAsync"
154 cudaStream_t stream,
void* devPtr,
int value,
size_t count
157 cudaMemsetAsync(devPtr, value, count, stream),
158 "failed to perform cudaMemsetAsync"
207struct cudaSharedMemory
212 extern __device__
void error(
void);
226struct cudaSharedMemory <int>
228 __device__
int *get()
230 extern __shared__
int s_int[];
239struct cudaSharedMemory <unsigned int>
241 __device__
unsigned int *get()
243 extern __shared__
unsigned int s_uint[];
252struct cudaSharedMemory <char>
254 __device__
char *get()
256 extern __shared__
char s_char[];
265struct cudaSharedMemory <unsigned char>
267 __device__
unsigned char *get()
269 extern __shared__
unsigned char s_uchar[];
278struct cudaSharedMemory <short>
280 __device__
short *get()
282 extern __shared__
short s_short[];
291struct cudaSharedMemory <unsigned short>
293 __device__
unsigned short *get()
295 extern __shared__
unsigned short s_ushort[];
304struct cudaSharedMemory <long>
306 __device__
long *get()
308 extern __shared__
long s_long[];
317struct cudaSharedMemory <unsigned long>
319 __device__
unsigned long *get()
321 extern __shared__
unsigned long s_ulong[];
340struct cudaSharedMemory <bool>
342 __device__
bool *get()
344 extern __shared__
bool s_bool[];
353struct cudaSharedMemory <float>
355 __device__
float *get()
357 extern __shared__
float s_float[];
366struct cudaSharedMemory <double>
368 __device__
double *get()
370 extern __shared__
double s_double[];
385class cudaDeviceAllocator {
392 using value_type = T;
402 using reference = T&;
407 using const_pointer =
const T*;
412 using const_reference =
const T&;
417 using size_type = std::size_t;
422 using difference_type = std::ptrdiff_t;
432 using other = cudaDeviceAllocator<U>;
438 cudaDeviceAllocator() noexcept {}
443 cudaDeviceAllocator(
const cudaDeviceAllocator& )
noexcept {}
450 cudaDeviceAllocator(
const cudaDeviceAllocator<U>& )
noexcept {}
455 ~cudaDeviceAllocator() noexcept {}
465 pointer address( reference x ) {
return &x; }
475 const_pointer address( const_reference x )
const {
return &x; }
493 pointer allocate( size_type n,
const void* = 0 )
497 cudaMalloc( &ptr, n*
sizeof(T) ),
498 "failed to allocate ", n,
" elements (", n*
sizeof(T),
"bytes)"
500 return static_cast<pointer>(ptr);
510 void deallocate( pointer ptr, size_type )
527 size_type max_size() const noexcept {
return size_type {-1}; }
532 void construct( pointer, const_reference) { }
537 void destroy( pointer) { }
546 template <
typename U>
547 bool operator == (
const cudaDeviceAllocator<U>&)
const noexcept {
558 template <
typename U>
559 bool operator != (
const cudaDeviceAllocator<U>&)
const noexcept {
573class cudaUSMAllocator {
580 using value_type = T;
590 using reference = T&;
595 using const_pointer =
const T*;
600 using const_reference =
const T&;
605 using size_type = std::size_t;
610 using difference_type = std::ptrdiff_t;
626 cudaUSMAllocator() noexcept {}
631 cudaUSMAllocator(
const cudaUSMAllocator& )
noexcept {}
638 cudaUSMAllocator(
const cudaUSMAllocator<U>& )
noexcept {}
643 ~cudaUSMAllocator() noexcept {}
653 pointer address( reference x ) {
return &x; }
663 const_pointer address( const_reference x )
const {
return &x; }
681 pointer allocate( size_type n,
const void* = 0 )
685 cudaMallocManaged( &ptr, n*
sizeof(T) ),
686 "failed to allocate ", n,
" elements (", n*
sizeof(T),
"bytes)"
688 return static_cast<pointer>(ptr);
698 void deallocate( pointer ptr, size_type )
715 size_type max_size() const noexcept {
return size_type {-1}; }
724 void construct( pointer ptr, const_reference val ) {
725 new ((
void*)ptr) value_type(val);
736 void destroy( pointer ptr ) {
747 template <
typename U>
748 bool operator == (
const cudaUSMAllocator<U>&)
const noexcept {
759 template <
typename U>
760 bool operator != (
const cudaUSMAllocator<U>&)
const noexcept {
780class cudaDeviceVector {
784 cudaDeviceVector() =
default;
786 cudaDeviceVector(
size_t N) : _N {N} {
789 cudaMalloc(&_data, N*
sizeof(T)),
790 "failed to allocate device memory (", N*
sizeof(T),
" bytes)"
795 cudaDeviceVector(cudaDeviceVector&& rhs) :
796 _data{rhs._data}, _N {rhs._N} {
801 ~cudaDeviceVector() {
807 cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
818 size_t size()
const {
return _N; }
820 T* data() {
return _data; }
821 const T* data()
const {
return _data; }
823 cudaDeviceVector(
const cudaDeviceVector&) =
delete;
824 cudaDeviceVector& operator = (
const cudaDeviceVector&) =
delete;
class to create an RAII-styled context switch
Definition cuda_device.hpp:289
taskflow namespace
Definition small_vector.hpp:20
size_t cuda_get_free_mem(int d)
queries the free memory (expensive call)
Definition cuda_memory.hpp:19
T * cuda_malloc_device(size_t N, int d)
allocates memory on the given device for holding N elements of type T
Definition cuda_memory.hpp:48
size_t cuda_get_total_mem(int d)
queries the total available memory (expensive call)
Definition cuda_memory.hpp:31
void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_memory.hpp:153
void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_memory.hpp:132
void cuda_free(T *ptr, int d)
frees memory on the GPU device
Definition cuda_memory.hpp:101
T * cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
Definition cuda_memory.hpp:81
its member type other is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:428
cudaDeviceAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:432
its member type other is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:616
cudaUSMAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:620