Loading...
Searching...
No Matches
cuda_memory.hpp
1#pragma once
2
#include <cstddef>
#include <limits>
#include <new>

#include "cuda_device.hpp"
4
9
10namespace tf {
11
12// ----------------------------------------------------------------------------
13// memory
14// ----------------------------------------------------------------------------
15
/**
@brief queries the free memory on the given device (expensive call)

@param d device identifier
@return the number of free bytes reported by cudaMemGetInfo
*/
inline size_t cuda_get_free_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t bytes_free = 0, bytes_total = 0;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&bytes_free, &bytes_total),
    "failed to get mem info on device ", d
  );
  return bytes_free;
}
27
/**
@brief queries the total available memory on the given device (expensive call)

@param d device identifier
@return the total number of bytes reported by cudaMemGetInfo
*/
inline size_t cuda_get_total_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t bytes_free = 0, bytes_total = 0;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&bytes_free, &bytes_total),
    "failed to get mem info on device ", d
  );
  return bytes_total;
}
39
/**
@brief allocates memory on the given device for holding N elements of type T

Switches to device d (RAII context) before calling cudaMalloc.
The returned memory is uninitialized.

@tparam T element type
@param N number of elements
@param d device identifier
@return pointer to the allocated device memory
*/
template <typename T>
T* cuda_malloc_device(size_t N, int d) {
  cudaScopedDevice ctx(d);
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMalloc(&ptr, bytes),
    "failed to allocate memory (", bytes, "bytes) on device ", d
  )
  return ptr;
}
57
/**
@brief allocates memory on the current device for holding N elements of type T

The returned memory is uninitialized.

@tparam T element type
@param N number of elements
@return pointer to the allocated device memory
*/
template <typename T>
T* cuda_malloc_device(size_t N) {
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMalloc(&ptr, bytes),
    "failed to allocate memory (", bytes, "bytes)"
  )
  return ptr;
}
73
/**
@brief allocates unified (managed) memory for holding N elements of type T

Uses cudaMallocManaged, so the returned pointer is accessible from both
host and device.

@tparam T element type
@param N number of elements
@return pointer to the allocated managed memory
*/
template <typename T>
T* cuda_malloc_shared(size_t N) {
  const size_t bytes = N * sizeof(T);
  T* ptr = nullptr;
  TF_CHECK_CUDA(
    cudaMallocManaged(&ptr, bytes),
    "failed to allocate shared memory (", bytes, "bytes)"
  )
  return ptr;
}
89
/**
@brief frees memory on the given GPU device

@tparam T pointer element type
@param ptr device pointer to free
@param d device identifier the memory belongs to
*/
template <typename T>
void cuda_free(T* ptr, int d) {
  cudaScopedDevice ctx(d);
  TF_CHECK_CUDA(
    cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d
  );
}
105
/**
@brief frees memory on the current GPU device

@tparam T pointer element type
@param ptr device pointer to free
*/
template <typename T>
void cuda_free(T* ptr) {
  TF_CHECK_CUDA(
    cudaFree(ptr), "failed to free memory ", ptr
  );
}
119
/**
@brief copies data between host and device asynchronously through a stream

The copy direction is inferred from the pointer values (cudaMemcpyDefault),
which requires unified virtual addressing.

@param stream stream identifier
@param dst destination memory address
@param src source memory address
@param count size in bytes to copy
*/
inline void cuda_memcpy_async(
  cudaStream_t stream, void* dst, const void* src, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
    "failed to perform cudaMemcpyAsync"
  );
}
140
/**
@brief initializes or sets GPU memory to the given value byte by byte

Note cudaMemsetAsync sets each *byte* to value; it cannot fill non-zero
multi-byte patterns (e.g., a float other than 0).

@param stream stream identifier
@param devPtr pointer to the GPU memory
@param value value to set per byte
@param count number of bytes to set
*/
inline void cuda_memset_async(
  cudaStream_t stream, void* devPtr, int value, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemsetAsync(devPtr, value, count, stream),
    "failed to perform cudaMemsetAsync"
  );
}
161
162// ----------------------------------------------------------------------------
163// Shared Memory
164// ----------------------------------------------------------------------------
165//
166// Because dynamically sized shared memory arrays are declared "extern",
167// we can't templatize them directly. To get around this, we declare a
168// simple wrapper struct that will declare the extern array with a different
169// name depending on the type. This avoids compiler errors about duplicate
170// definitions.
171//
172// To use dynamically allocated shared memory in a templatized __global__ or
173// __device__ function, just replace code like this:
174//
175// template<class T>
176// __global__ void
177// foo( T* g_idata, T* g_odata)
178// {
179// // Shared mem size is determined by the host app at run time
180// extern __shared__ T sdata[];
181// ...
182// doStuff(sdata);
183// ...
184// }
185//
186// With this:
187//
188// template<class T>
189// __global__ void
190// foo( T* g_idata, T* g_odata)
191// {
192// // Shared mem size is determined by the host app at run time
193// cudaSharedMemory<T> smem;
194// T* sdata = smem.get();
195// ...
196// doStuff(sdata);
197// ...
198// }
199// ----------------------------------------------------------------------------
200
201// This is the un-specialized struct. Note that we prevent instantiation of this
202// struct by putting an undefined symbol in the function body so it won't compile.
// Primary template: deliberately unusable. Instantiating get() for a type
// with no specialization below references the undefined symbol error(),
// which fails the build instead of silently declaring a mistyped extern
// shared array.
template <typename T>
struct cudaSharedMemory {
  __device__ T* get() {
    extern __device__ void error(void);
    error();
    return NULL;
  }
};
217
218// Following are the specializations for the following types.
219// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
220// One could also specialize it for user-defined types.
221
/** @brief dynamic shared-memory specialization for int */
template <>
struct cudaSharedMemory<int> {
  __device__ int* get() {
    extern __shared__ int s_int[];
    return s_int;
  }
};
234
/** @brief dynamic shared-memory specialization for unsigned int */
template <>
struct cudaSharedMemory<unsigned int> {
  __device__ unsigned int* get() {
    extern __shared__ unsigned int s_uint[];
    return s_uint;
  }
};
247
/** @brief dynamic shared-memory specialization for char */
template <>
struct cudaSharedMemory<char> {
  __device__ char* get() {
    extern __shared__ char s_char[];
    return s_char;
  }
};
260
/** @brief dynamic shared-memory specialization for unsigned char */
template <>
struct cudaSharedMemory<unsigned char> {
  __device__ unsigned char* get() {
    extern __shared__ unsigned char s_uchar[];
    return s_uchar;
  }
};
273
/** @brief dynamic shared-memory specialization for short */
template <>
struct cudaSharedMemory<short> {
  __device__ short* get() {
    extern __shared__ short s_short[];
    return s_short;
  }
};
286
/** @brief dynamic shared-memory specialization for unsigned short */
template <>
struct cudaSharedMemory<unsigned short> {
  __device__ unsigned short* get() {
    extern __shared__ unsigned short s_ushort[];
    return s_ushort;
  }
};
299
/** @brief dynamic shared-memory specialization for long */
template <>
struct cudaSharedMemory<long> {
  __device__ long* get() {
    extern __shared__ long s_long[];
    return s_long;
  }
};
312
/** @brief dynamic shared-memory specialization for unsigned long */
template <>
struct cudaSharedMemory<unsigned long> {
  __device__ unsigned long* get() {
    extern __shared__ unsigned long s_ulong[];
    return s_ulong;
  }
};
325
326//template <>
327//struct cudaSharedMemory <size_t>
328//{
329// __device__ size_t *get()
330// {
331// extern __shared__ size_t s_sizet[];
332// return s_sizet;
333// }
334//};
335
/** @brief dynamic shared-memory specialization for bool */
template <>
struct cudaSharedMemory<bool> {
  __device__ bool* get() {
    extern __shared__ bool s_bool[];
    return s_bool;
  }
};
348
/** @brief dynamic shared-memory specialization for float */
template <>
struct cudaSharedMemory<float> {
  __device__ float* get() {
    extern __shared__ float s_float[];
    return s_float;
  }
};
361
/** @brief dynamic shared-memory specialization for double */
template <>
struct cudaSharedMemory<double> {
  __device__ double* get() {
    extern __shared__ double s_double[];
    return s_double;
  }
};
374
375
376
377// ----------------------------------------------------------------------------
378// cudaDeviceAllocator
379// ----------------------------------------------------------------------------
380
/**
@class cudaDeviceAllocator

@brief allocator that obtains raw GPU device memory through cudaMalloc

Device memory cannot be dereferenced on the host, so construct/destroy are
intentionally no-ops: this allocator manages raw buffers and must not be
used with host-side containers that touch their elements.
*/
template<typename T>
class cudaDeviceAllocator {

  public:

    /** @brief element type */
    using value_type = T;

    /** @brief element pointer type */
    using pointer = T*;

    /** @brief element reference type */
    using reference = T&;

    /** @brief const element pointer type */
    using const_pointer = const T*;

    /** @brief const element reference type */
    using const_reference = const T&;

    /** @brief size type */
    using size_type = std::size_t;

    /** @brief pointer difference type */
    using difference_type = std::ptrdiff_t;

    /**
    @brief its member type U is the equivalent allocator type to allocate elements of type U
    */
    template<typename U>
    struct rebind {
      /** @brief allocator of a different data type */
      using other = cudaDeviceAllocator<U>;
    };

    /** @brief constructs a device allocator object */
    cudaDeviceAllocator() noexcept {}

    /** @brief copy-constructs a device allocator object */
    cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}

    /** @brief constructs a device allocator object from one of another element type */
    template<typename U>
    cudaDeviceAllocator( const cudaDeviceAllocator<U>& ) noexcept {}

    /** @brief destructs the device allocator object */
    ~cudaDeviceAllocator() noexcept {}

    /** @brief returns the address of x */
    pointer address( reference x ) { return &x; }

    /** @brief returns the const address of x */
    const_pointer address( const_reference x ) const { return &x; }

    /**
    @brief allocates device memory for n elements of type T

    Throws (via TF_CHECK_CUDA) if cudaMalloc fails.

    @param n number of elements
    @return pointer to the allocated device memory
    */
    pointer allocate( size_type n, const void* = 0 )
    {
      void* ptr = nullptr;
      TF_CHECK_CUDA(
        cudaMalloc( &ptr, n*sizeof(T) ),
        "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
      )
      return static_cast<pointer>(ptr);
    }

    /**
    @brief deallocates device memory previously obtained from allocate

    The cudaFree return code is deliberately ignored so deallocation
    never throws.
    */
    void deallocate( pointer ptr, size_type )
    {
      if(ptr){
        cudaFree(ptr);
      }
    }

    /**
    @brief returns the maximum number of elements that could potentially
           be requested
    */
    size_type max_size() const noexcept {
      // brace-initializing size_type from -1 is a narrowing conversion
      // (ill-formed since C++11); use numeric_limits for the same value
      return std::numeric_limits<size_type>::max();
    }

    /** @brief no-op: device memory cannot be constructed from the host */
    void construct( pointer, const_reference) { }

    /** @brief no-op: device memory cannot be destroyed from the host */
    void destroy( pointer) { }

    /** @brief all cudaDeviceAllocator instances are interchangeable */
    template <typename U>
    bool operator == (const cudaDeviceAllocator<U>&) const noexcept {
      return true;
    }

    /** @brief all cudaDeviceAllocator instances are interchangeable */
    template <typename U>
    bool operator != (const cudaDeviceAllocator<U>&) const noexcept {
      return false;
    }

};
564
565// ----------------------------------------------------------------------------
566// cudaUSMAllocator
567// ----------------------------------------------------------------------------
568
/**
@class cudaUSMAllocator

@brief allocator that obtains unified shared memory through cudaMallocManaged

Managed memory is host-accessible, so construct/destroy perform real
placement-new construction and destruction, unlike cudaDeviceAllocator.
*/
template<typename T>
class cudaUSMAllocator {

  public:

    /** @brief element type */
    using value_type = T;

    /** @brief element pointer type */
    using pointer = T*;

    /** @brief element reference type */
    using reference = T&;

    /** @brief const element pointer type */
    using const_pointer = const T*;

    /** @brief const element reference type */
    using const_reference = const T&;

    /** @brief size type */
    using size_type = std::size_t;

    /** @brief pointer difference type */
    using difference_type = std::ptrdiff_t;

    /**
    @brief its member type U is the equivalent allocator type to allocate elements of type U
    */
    template<typename U>
    struct rebind {
      /** @brief allocator of a different data type */
      using other = cudaUSMAllocator<U>;
    };

    /** @brief constructs a USM allocator object */
    cudaUSMAllocator() noexcept {}

    /** @brief copy-constructs a USM allocator object */
    cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}

    /** @brief constructs a USM allocator object from one of another element type */
    template<typename U>
    cudaUSMAllocator( const cudaUSMAllocator<U>& ) noexcept {}

    /** @brief destructs the USM allocator object */
    ~cudaUSMAllocator() noexcept {}

    /** @brief returns the address of x */
    pointer address( reference x ) { return &x; }

    /** @brief returns the const address of x */
    const_pointer address( const_reference x ) const { return &x; }

    /**
    @brief allocates managed memory for n elements of type T

    Throws (via TF_CHECK_CUDA) if cudaMallocManaged fails.

    @param n number of elements
    @return pointer to the allocated managed memory
    */
    pointer allocate( size_type n, const void* = 0 )
    {
      void* ptr {nullptr};
      TF_CHECK_CUDA(
        cudaMallocManaged( &ptr, n*sizeof(T) ),
        "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
      )
      return static_cast<pointer>(ptr);
    }

    /**
    @brief deallocates managed memory previously obtained from allocate

    The cudaFree return code is deliberately ignored so deallocation
    never throws.
    */
    void deallocate( pointer ptr, size_type )
    {
      if(ptr){
        cudaFree(ptr);
      }
    }

    /**
    @brief returns the maximum number of elements that could potentially
           be requested
    */
    size_type max_size() const noexcept {
      // brace-initializing size_type from -1 is a narrowing conversion
      // (ill-formed since C++11); use numeric_limits for the same value
      return std::numeric_limits<size_type>::max();
    }

    /** @brief placement-new constructs an element of value val at ptr */
    void construct( pointer ptr, const_reference val ) {
      new ((void*)ptr) value_type(val);
    }

    /** @brief invokes the destructor of the element at ptr */
    void destroy( pointer ptr ) {
      ptr->~value_type();
    }

    /** @brief all cudaUSMAllocator instances are interchangeable */
    template <typename U>
    bool operator == (const cudaUSMAllocator<U>&) const noexcept {
      return true;
    }

    /** @brief all cudaUSMAllocator instances are interchangeable */
    template <typename U>
    bool operator != (const cudaUSMAllocator<U>&) const noexcept {
      return false;
    }

};
765
766// ----------------------------------------------------------------------------
767// GPU vector object
768// ----------------------------------------------------------------------------
769
770//template <typename T>
771//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>;
772
773//template <typename T>
774//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>;
775
/**
@class cudaDeviceVector

@brief move-only RAII owner of a fixed-size raw GPU device buffer

The buffer is uninitialized device memory; elements are never constructed
or destroyed, only allocated and freed.
*/
template <typename T>
class cudaDeviceVector {

  public:

    /** @brief constructs an empty vector that owns no memory */
    cudaDeviceVector() = default;

    /**
    @brief allocates device memory for N elements (uninitialized)

    Throws (via TF_CHECK_CUDA) if cudaMalloc fails; N == 0 allocates nothing.
    */
    cudaDeviceVector(size_t N) : _N {N} {
      if(N) {
        TF_CHECK_CUDA(
          cudaMalloc(&_data, N*sizeof(T)),
          "failed to allocate device memory (", N*sizeof(T), " bytes)"
        );
      }
    }

    /** @brief move constructor; leaves rhs empty */
    cudaDeviceVector(cudaDeviceVector&& rhs) :
      _data{rhs._data}, _N {rhs._N} {
      rhs._data = nullptr;
      rhs._N = 0;
    }

    /** @brief frees the owned device memory, if any */
    ~cudaDeviceVector() {
      if(_data) {
        cudaFree(_data);
      }
    }

    /** @brief move assignment; frees the current buffer and leaves rhs empty */
    cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
      // self-assignment guard: without it a self-move would free the
      // buffer and then adopt the dangling pointer
      if(this != &rhs) {
        if(_data) {
          cudaFree(_data);
        }
        _data = rhs._data;
        _N = rhs._N;
        rhs._data = nullptr;
        rhs._N = 0;
      }
      return *this;
    }

    /** @brief returns the number of elements the buffer was sized for */
    size_t size() const { return _N; }

    /** @brief returns the raw device pointer (nullptr when empty) */
    T* data() { return _data; }

    /** @brief returns the raw device pointer (nullptr when empty) */
    const T* data() const { return _data; }

    // copying is disabled: the class owns a unique device allocation
    cudaDeviceVector(const cudaDeviceVector&) = delete;
    cudaDeviceVector& operator = (const cudaDeviceVector&) = delete;

  private:

    T* _data {nullptr};   // owned device pointer
    size_t _N {0};        // element count
};
831
832
833} // end of namespace tf -----------------------------------------------------
834
835
836
837
838
839
class to create an RAII-styled context switch
Definition cuda_device.hpp:289
taskflow namespace
Definition small_vector.hpp:20
size_t cuda_get_free_mem(int d)
queries the free memory (expensive call)
Definition cuda_memory.hpp:19
T * cuda_malloc_device(size_t N, int d)
allocates memory on the given device for holding N elements of type T
Definition cuda_memory.hpp:48
size_t cuda_get_total_mem(int d)
queries the total available memory (expensive call)
Definition cuda_memory.hpp:31
void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_memory.hpp:153
void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_memory.hpp:132
void cuda_free(T *ptr, int d)
frees memory on the GPU device
Definition cuda_memory.hpp:101
T * cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
Definition cuda_memory.hpp:81
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:428
cudaDeviceAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:432
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:616
cudaUSMAllocator< U > other
allocator of a different data type
Definition cuda_memory.hpp:620