Loading...
Searching...
No Matches
cuda_memory.hpp
1#pragma once
2
#include <cstddef>
#include <limits>
#include <new>

#include "cuda_device.hpp"
4
9
10namespace tf {
11
12// ----------------------------------------------------------------------------
13// memory
14// ----------------------------------------------------------------------------
15
/**
@brief queries the free memory on the given device (expensive call)

@param d device identifier
@return the number of free bytes reported by cudaMemGetInfo
*/
inline size_t cuda_get_free_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t bytes_free = 0, bytes_total = 0;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&bytes_free, &bytes_total),
    "failed to get mem info on device ", d
  );
  return bytes_free;
}
27
/**
@brief queries the total available memory on the given device (expensive call)

@param d device identifier
@return the total number of bytes reported by cudaMemGetInfo
*/
inline size_t cuda_get_total_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t bytes_free = 0, bytes_total = 0;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&bytes_free, &bytes_total),
    "failed to get mem info on device ", d
  );
  return bytes_total;
}
39
/**
@brief allocates memory on the given device for holding N elements of type T

Switches to device d (RAII context) before calling cudaMalloc.
The returned memory is uninitialized.

@tparam T element type
@param N number of elements
@param d device identifier
@return pointer to the allocated device memory
*/
template <typename T>
T* cuda_malloc_device(size_t N, int d) {
  cudaScopedDevice ctx(d);
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMalloc(&ptr, bytes),
    "failed to allocate memory (", bytes, "bytes) on device ", d
  )
  return ptr;
}
57
/**
@brief allocates memory on the current device for holding N elements of type T

The returned memory is uninitialized.

@tparam T element type
@param N number of elements
@return pointer to the allocated device memory
*/
template <typename T>
T* cuda_malloc_device(size_t N) {
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMalloc(&ptr, bytes),
    "failed to allocate memory (", bytes, "bytes)"
  )
  return ptr;
}
73
/**
@brief allocates unified (managed) memory for holding N elements of type T

Uses cudaMallocManaged, so the returned pointer is accessible from both
host and device.

@tparam T element type
@param N number of elements
@return pointer to the allocated managed memory
*/
template <typename T>
T* cuda_malloc_shared(size_t N) {
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMallocManaged(&ptr, bytes),
    "failed to allocate shared memory (", bytes, "bytes)"
  )
  return ptr;
}
89
/**
@brief frees memory on the given GPU device

@tparam T pointer element type
@param ptr device pointer to free
@param d device identifier the memory belongs to
*/
template <typename T>
void cuda_free(T* ptr, int d) {
  cudaScopedDevice ctx(d);
  TF_CHECK_CUDA(
    cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d
  );
}
105
/**
@brief frees memory on the current GPU device

@tparam T pointer element type
@param ptr device pointer to free
*/
template <typename T>
void cuda_free(T* ptr) {
  TF_CHECK_CUDA(
    cudaFree(ptr), "failed to free memory ", ptr
  );
}
119
/**
@brief copies data between host and device asynchronously through a stream

The copy direction is inferred from the pointer values (cudaMemcpyDefault),
which requires unified virtual addressing.

@param stream stream identifier
@param dst destination memory address
@param src source memory address
@param count size in bytes to copy
*/
inline void cuda_memcpy_async(
  cudaStream_t stream, void* dst, const void* src, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
    "failed to perform cudaMemcpyAsync"
  );
}
140
/**
@brief initializes or sets GPU memory to the given value byte by byte

Note cudaMemsetAsync sets each *byte* to value; it cannot fill non-zero
multi-byte patterns (e.g., a float other than 0).

@param stream stream identifier
@param devPtr pointer to the GPU memory
@param value value to set per byte
@param count number of bytes to set
*/
inline void cuda_memset_async(
  cudaStream_t stream, void* devPtr, int value, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemsetAsync(devPtr, value, count, stream),
    "failed to perform cudaMemsetAsync"
  );
}
161
162// ----------------------------------------------------------------------------
163// Shared Memory
164// ----------------------------------------------------------------------------
165//
166// Because dynamically sized shared memory arrays are declared "extern",
167// we can't templatize them directly. To get around this, we declare a
168// simple wrapper struct that will declare the extern array with a different
169// name depending on the type. This avoids compiler errors about duplicate
170// definitions.
171//
172// To use dynamically allocated shared memory in a templatized __global__ or
173// __device__ function, just replace code like this:
174//
175// template<class T>
176// __global__ void
177// foo( T* g_idata, T* g_odata)
178// {
179// // Shared mem size is determined by the host app at run time
180// extern __shared__ T sdata[];
181// ...
182// doStuff(sdata);
183// ...
184// }
185//
186// With this:
187//
188// template<class T>
189// __global__ void
190// foo( T* g_idata, T* g_odata)
191// {
192// // Shared mem size is determined by the host app at run time
193// cudaSharedMemory<T> smem;
194// T* sdata = smem.get();
195// ...
196// doStuff(sdata);
197// ...
198// }
199// ----------------------------------------------------------------------------
200
201// This is the un-specialized struct. Note that we prevent instantiation of this
202// struct by putting an undefined symbol in the function body so it won't compile.
// Primary template: deliberately unusable. Instantiating get() for a type
// with no specialization below references the undefined symbol error(),
// which fails the build instead of silently declaring a mistyped extern
// shared array.
template <typename T>
struct cudaSharedMemory {
  __device__ T* get() {
    extern __device__ void error(void);
    error();
    return NULL;
  }
};
217
218// Following are the specializations for the following types.
219// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
220// One could also specialize it for user-defined types.
221
/** @brief dynamic shared-memory specialization for int */
template <>
struct cudaSharedMemory<int> {
  __device__ int* get() {
    extern __shared__ int s_int[];
    return s_int;
  }
};
234
/** @brief dynamic shared-memory specialization for unsigned int */
template <>
struct cudaSharedMemory<unsigned int> {
  __device__ unsigned int* get() {
    extern __shared__ unsigned int s_uint[];
    return s_uint;
  }
};
247
/** @brief dynamic shared-memory specialization for char */
template <>
struct cudaSharedMemory<char> {
  __device__ char* get() {
    extern __shared__ char s_char[];
    return s_char;
  }
};
260
/** @brief dynamic shared-memory specialization for unsigned char */
template <>
struct cudaSharedMemory<unsigned char> {
  __device__ unsigned char* get() {
    extern __shared__ unsigned char s_uchar[];
    return s_uchar;
  }
};
273
/** @brief dynamic shared-memory specialization for short */
template <>
struct cudaSharedMemory<short> {
  __device__ short* get() {
    extern __shared__ short s_short[];
    return s_short;
  }
};
286
/** @brief dynamic shared-memory specialization for unsigned short */
template <>
struct cudaSharedMemory<unsigned short> {
  __device__ unsigned short* get() {
    extern __shared__ unsigned short s_ushort[];
    return s_ushort;
  }
};
299
/** @brief dynamic shared-memory specialization for long */
template <>
struct cudaSharedMemory<long> {
  __device__ long* get() {
    extern __shared__ long s_long[];
    return s_long;
  }
};
312
/** @brief dynamic shared-memory specialization for unsigned long */
template <>
struct cudaSharedMemory<unsigned long> {
  __device__ unsigned long* get() {
    extern __shared__ unsigned long s_ulong[];
    return s_ulong;
  }
};
325
326//template <>
327//struct cudaSharedMemory <size_t>
328//{
329// __device__ size_t *get()
330// {
331// extern __shared__ size_t s_sizet[];
332// return s_sizet;
333// }
334//};
335
/** @brief dynamic shared-memory specialization for bool */
template <>
struct cudaSharedMemory<bool> {
  __device__ bool* get() {
    extern __shared__ bool s_bool[];
    return s_bool;
  }
};
348
/** @brief dynamic shared-memory specialization for float */
template <>
struct cudaSharedMemory<float> {
  __device__ float* get() {
    extern __shared__ float s_float[];
    return s_float;
  }
};
361
/** @brief dynamic shared-memory specialization for double */
template <>
struct cudaSharedMemory<double> {
  __device__ double* get() {
    extern __shared__ double s_double[];
    return s_double;
  }
};
374
375
376
377// ----------------------------------------------------------------------------
378// cudaDeviceAllocator
379// ----------------------------------------------------------------------------
380
/**
@class cudaDeviceAllocator

@brief allocator that obtains raw GPU device memory through cudaMalloc

Device memory cannot be dereferenced on the host, so construct/destroy are
intentionally no-ops: this allocator manages raw buffers and must not be
used with host-side containers that touch their elements.
*/
template<typename T>
class cudaDeviceAllocator {

  public:

    /** @brief element type */
    using value_type = T;

    /** @brief element pointer type */
    using pointer = T*;

    /** @brief element reference type */
    using reference = T&;

    /** @brief const element pointer type */
    using const_pointer = const T*;

    /** @brief const element reference type */
    using const_reference = const T&;

    /** @brief size type */
    using size_type = std::size_t;

    /** @brief pointer difference type */
    using difference_type = std::ptrdiff_t;

    /**
    @brief its member type U is the equivalent allocator type to allocate elements of type U
    */
    template<typename U>
    struct rebind {
      /** @brief allocator of a different data type */
      using other = cudaDeviceAllocator<U>;
    };

    /** @brief constructs a device allocator object */
    cudaDeviceAllocator() noexcept {}

    /** @brief copy-constructs a device allocator object */
    cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}

    /** @brief constructs a device allocator object from one of another element type */
    template<typename U>
    cudaDeviceAllocator( const cudaDeviceAllocator<U>& ) noexcept {}

    /** @brief destructs the device allocator object */
    ~cudaDeviceAllocator() noexcept {}

    /** @brief returns the address of x */
    pointer address( reference x ) { return &x; }

    /** @brief returns the const address of x */
    const_pointer address( const_reference x ) const { return &x; }

    /**
    @brief allocates device memory for n elements of type T

    Throws (via TF_CHECK_CUDA) if cudaMalloc fails.

    @param n number of elements
    @return pointer to the allocated device memory
    */
    pointer allocate( size_type n, const void* = 0 )
    {
      void* ptr = nullptr;
      TF_CHECK_CUDA(
        cudaMalloc( &ptr, n*sizeof(T) ),
        "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
      )
      return static_cast<pointer>(ptr);
    }

    /**
    @brief deallocates device memory previously obtained from allocate

    The cudaFree return code is deliberately ignored so deallocation
    never throws.
    */
    void deallocate( pointer ptr, size_type )
    {
      if(ptr){
        cudaFree(ptr);
      }
    }

    /**
    @brief returns the maximum number of elements that could potentially
           be requested
    */
    size_type max_size() const noexcept {
      // brace-initializing size_type from -1 is a narrowing conversion
      // (ill-formed since C++11); use numeric_limits for the same value
      return std::numeric_limits<size_type>::max();
    }

    /** @brief no-op: device memory cannot be constructed from the host */
    void construct( pointer, const_reference) { }

    /** @brief no-op: device memory cannot be destroyed from the host */
    void destroy( pointer) { }

    /** @brief all cudaDeviceAllocator instances are interchangeable */
    template <typename U>
    bool operator == (const cudaDeviceAllocator<U>&) const noexcept {
      return true;
    }

    /** @brief all cudaDeviceAllocator instances are interchangeable */
    template <typename U>
    bool operator != (const cudaDeviceAllocator<U>&) const noexcept {
      return false;
    }

};
564
565// ----------------------------------------------------------------------------
566// cudaUSMAllocator
567// ----------------------------------------------------------------------------
568
/**
@class cudaUSMAllocator

@brief allocator that obtains unified shared memory through cudaMallocManaged

Managed memory is host-accessible, so construct/destroy perform real
placement-new construction and destruction, unlike cudaDeviceAllocator.
*/
template<typename T>
class cudaUSMAllocator {

  public:

    /** @brief element type */
    using value_type = T;

    /** @brief element pointer type */
    using pointer = T*;

    /** @brief element reference type */
    using reference = T&;

    /** @brief const element pointer type */
    using const_pointer = const T*;

    /** @brief const element reference type */
    using const_reference = const T&;

    /** @brief size type */
    using size_type = std::size_t;

    /** @brief pointer difference type */
    using difference_type = std::ptrdiff_t;

    /**
    @brief its member type U is the equivalent allocator type to allocate elements of type U
    */
    template<typename U>
    struct rebind {
      /** @brief allocator of a different data type */
      using other = cudaUSMAllocator<U>;
    };

    /** @brief constructs a USM allocator object */
    cudaUSMAllocator() noexcept {}

    /** @brief copy-constructs a USM allocator object */
    cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}

    /** @brief constructs a USM allocator object from one of another element type */
    template<typename U>
    cudaUSMAllocator( const cudaUSMAllocator<U>& ) noexcept {}

    /** @brief destructs the USM allocator object */
    ~cudaUSMAllocator() noexcept {}

    /** @brief returns the address of x */
    pointer address( reference x ) { return &x; }

    /** @brief returns the const address of x */
    const_pointer address( const_reference x ) const { return &x; }

    /**
    @brief allocates managed memory for n elements of type T

    Throws (via TF_CHECK_CUDA) if cudaMallocManaged fails.

    @param n number of elements
    @return pointer to the allocated managed memory
    */
    pointer allocate( size_type n, const void* = 0 )
    {
      void* ptr {nullptr};
      TF_CHECK_CUDA(
        cudaMallocManaged( &ptr, n*sizeof(T) ),
        "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
      )
      return static_cast<pointer>(ptr);
    }

    /**
    @brief deallocates managed memory previously obtained from allocate

    The cudaFree return code is deliberately ignored so deallocation
    never throws.
    */
    void deallocate( pointer ptr, size_type )
    {
      if(ptr){
        cudaFree(ptr);
      }
    }

    /**
    @brief returns the maximum number of elements that could potentially
           be requested
    */
    size_type max_size() const noexcept {
      // brace-initializing size_type from -1 is a narrowing conversion
      // (ill-formed since C++11); use numeric_limits for the same value
      return std::numeric_limits<size_type>::max();
    }

    /** @brief placement-new constructs an element of value val at ptr */
    void construct( pointer ptr, const_reference val ) {
      new ((void*)ptr) value_type(val);
    }

    /** @brief invokes the destructor of the element at ptr */
    void destroy( pointer ptr ) {
      ptr->~value_type();
    }

    /** @brief all cudaUSMAllocator instances are interchangeable */
    template <typename U>
    bool operator == (const cudaUSMAllocator<U>&) const noexcept {
      return true;
    }

    /** @brief all cudaUSMAllocator instances are interchangeable */
    template <typename U>
    bool operator != (const cudaUSMAllocator<U>&) const noexcept {
      return false;
    }

};
765
766// ----------------------------------------------------------------------------
767// GPU vector object
768// ----------------------------------------------------------------------------
769
770//template <typename T>
771//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>;
772
773//template <typename T>
774//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>;
775
/**
@class cudaDeviceVector

@brief move-only RAII owner of a fixed-size raw GPU device buffer

The buffer is uninitialized device memory; elements are never constructed
or destroyed, only allocated and freed.
*/
template <typename T>
class cudaDeviceVector {

  public:

    /** @brief constructs an empty vector that owns no memory */
    cudaDeviceVector() = default;

    /**
    @brief allocates device memory for N elements (uninitialized)

    Throws (via TF_CHECK_CUDA) if cudaMalloc fails; N == 0 allocates nothing.
    */
    cudaDeviceVector(size_t N) : _N {N} {
      if(N) {
        TF_CHECK_CUDA(
          cudaMalloc(&_data, N*sizeof(T)),
          "failed to allocate device memory (", N*sizeof(T), " bytes)"
        );
      }
    }

    /** @brief move constructor; leaves rhs empty */
    cudaDeviceVector(cudaDeviceVector&& rhs) :
      _data{rhs._data}, _N {rhs._N} {
      rhs._data = nullptr;
      rhs._N = 0;
    }

    /** @brief frees the owned device memory, if any */
    ~cudaDeviceVector() {
      if(_data) {
        cudaFree(_data);
      }
    }

    /** @brief move assignment; frees the current buffer and leaves rhs empty */
    cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
      // self-assignment guard: without it a self-move would free the
      // buffer and then adopt the dangling pointer
      if(this != &rhs) {
        if(_data) {
          cudaFree(_data);
        }
        _data = rhs._data;
        _N = rhs._N;
        rhs._data = nullptr;
        rhs._N = 0;
      }
      return *this;
    }

    /** @brief returns the number of elements the buffer was sized for */
    size_t size() const { return _N; }

    /** @brief returns the raw device pointer (nullptr when empty) */
    T* data() { return _data; }

    /** @brief returns the raw device pointer (nullptr when empty) */
    const T* data() const { return _data; }

    // copying is disabled: the class owns a unique device allocation
    cudaDeviceVector(const cudaDeviceVector&) = delete;
    cudaDeviceVector& operator = (const cudaDeviceVector&) = delete;

  private:

    T* _data {nullptr};   // owned device pointer
    size_t _N {0};        // element count
};
831
832
833} // end of namespace tf -----------------------------------------------------
834
835
836
837
838
839
class to create an RAII-styled context switch
Definition cuda_device.hpp:289
taskflow namespace
Definition small_vector.hpp:20
size_t cuda_get_free_mem(int d)
queries the free memory (expensive call)
Definition cuda_memory.hpp:19
T * cuda_malloc_device(size_t N, int d)
allocates memory on the given device for holding N elements of type T
Definition cuda_memory.hpp:48
size_t cuda_get_total_mem(int d)
queries the total available memory (expensive call)
Definition cuda_memory.hpp:31
void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_memory.hpp:153
void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_memory.hpp:132
void cuda_free(T *ptr, int d)
frees memory on the GPU device
Definition cuda_memory.hpp:101
T * cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
Definition cuda_memory.hpp:81
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:428
cudaDeviceAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:432
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:616
cudaUSMAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:620