template<typename Creator, typename Deleter>
cudaGraphBase class
class to create a CUDA graph managed by C++ smart pointer
Template parameters | |
---|---|
Creator | functor to create the graph (used in constructor) |
Deleter | functor to delete the graph (used in destructor) |
This class wraps a cudaGraph_t
handle with std::unique_ptr, so the graph is owned and released through a C++ smart pointer.
Public types
-
using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>
- base std::unique_ptr type
Constructors, destructors, conversion operators
-
template<typename... ArgsT>cudaGraphBase(ArgsT && ... args) explicit
- constructs a
cudaGraph
object by passing the given arguments to the executable CUDA graph creator - cudaGraphBase(cudaGraphBase&&) defaulted
- constructs a
cudaGraph
from the given rhs using move semantics
Public functions
- auto operator=(cudaGraphBase&&) -> cudaGraphBase& defaulted
- assign the rhs to
*this
using move semantics - auto num_nodes() const -> size_t
- queries the number of nodes in a native CUDA graph
- auto num_edges() const -> size_t
- queries the number of edges in a native CUDA graph
- auto empty() const -> bool
- queries if the graph is empty
-
void dump(std::
ostream& os) - dumps the CUDA graph to a DOT format through the given output stream
- auto noop() -> cudaTask
- creates a no-operation task
-
template<typename C>auto host(C&& callable, void* user_data) -> cudaTask
- creates a host task that runs a callable on the host
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
- creates a kernel task
- auto memset(void* dst, int v, size_t count) -> cudaTask
- creates a memset task that fills untyped data with a byte value
- auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
- creates a memcpy task that copies untyped data in bytes
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto zero(T* dst, size_t count) -> cudaTask
- creates a memset task that sets a typed memory block to zero
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto fill(T* dst, T value, size_t count) -> cudaTask
- creates a memset task that fills a typed memory block with a value
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- creates a memcopy task that copies typed data
-
template<typename C>auto single_task(C c) -> cudaTask
- runs a callable with only a single kernel thread
-
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>auto for_each(I first, I last, C callable) -> cudaTask
- applies a callable to each dereferenced element of the data array
-
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- applies a callable to each index in the range with the step size
-
template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>auto transform(I first, I last, O output, C op) -> cudaTask
- applies a callable to a source range and stores the result in a target range
-
template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- creates a task to perform parallel transforms over two ranges of items
Function documentation
template<typename Creator, typename Deleter>
template<typename... ArgsT>
tf:: cudaGraphBase<Creator, Deleter>:: cudaGraphBase(ArgsT && ... args) explicit
constructs a cudaGraph
object by passing the given arguments to the executable CUDA graph creator
Parameters | |
---|---|
args | arguments to pass to the executable CUDA graph creator |
Constructs a cudaGraph
object by passing the given arguments to the executable CUDA graph creator
template<typename Creator, typename Deleter>
void tf:: cudaGraphBase<Creator, Deleter>:: dump(std:: ostream& os)
dumps the CUDA graph to a DOT format through the given output stream
Parameters | |
---|---|
os | target output stream |
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: noop()
creates a no-operation task
Returns | a tf::cudaTask handle |
---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
template<typename Creator, typename Deleter>
template<typename C>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: host(C&& callable,
void* user_data)
creates a host task that runs a callable on the host
Template parameters | |
---|---|
C | callable type |
Parameters | |
callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()> ) |
user_data | a pointer to the user data |
Returns | a tf::cudaTask handle |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc
).
template<typename Creator, typename Deleter>
template<typename F, typename... ArgsT>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT... args)
creates a kernel task
Template parameters | |
---|---|
F | kernel function type |
ArgsT | kernel function parameters type |
Parameters | |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
Returns | a tf::cudaTask handle |
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: memset(void* dst,
int v,
size_t count)
creates a memset task that fills untyped data with a byte value
Parameters | |
---|---|
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
Returns | a tf::cudaTask handle |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
template<typename Creator, typename Deleter>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: memcpy(void* tgt,
const void* src,
size_t bytes)
creates a memcpy task that copies untyped data in bytes
Parameters | |
---|---|
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
Returns | a tf::cudaTask handle |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: zero(T* dst,
size_t count)
creates a memset task that sets a typed memory block to zero
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
count | number of elements |
Returns | a tf::cudaTask handle |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: fill(T* dst,
T value,
size_t count)
creates a memset task that fills a typed memory block with a value
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
Returns | a tf::cudaTask handle |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted in type T
rather than byte.
template<typename Creator, typename Deleter>
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: copy(T* tgt,
const T* src,
size_t num)
creates a memcopy task that copies typed data
Template parameters | |
---|---|
T | element type (non-void) |
Parameters | |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
Returns | a tf::cudaTask handle |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename Creator, typename Deleter>
template<typename C>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: single_task(C c)
runs a callable with only a single kernel thread
Template parameters | |
---|---|
C | callable type |
Parameters | |
c | callable to run by a single kernel thread |
Returns | a tf::cudaTask handle |
template<typename Creator, typename Deleter>
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: for_each(I first,
I last,
C callable)
applies a callable to each dereferenced element of the data array
Template parameters | |
---|---|
I | iterator type |
C | callable type |
E | execution policy (default tf::cudaDefaultExecutionPolicy) |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
callable | a callable object to apply to the dereferenced iterator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename Creator, typename Deleter>
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: for_each_index(I first,
I last,
I step,
C callable)
applies a callable to each index in the range with the step size
Template parameters | |
---|---|
I | index type |
C | callable type |
E | execution policy (default tf::cudaDefaultExecutionPolicy) |
Parameters | |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each index in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last)
for(auto i=first; i<last; i+=step) { callable(i); }
// step is negative [first, last)
for(auto i=first; i>last; i+=step) { callable(i); }
template<typename Creator, typename Deleter>
template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: transform(I first,
I last,
O output,
C op)
applies a callable to a source range and stores the result in a target range
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | unary operator type |
E | execution policy (default tf::cudaDefaultExecutionPolicy) |
Parameters | |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | the operator to apply to transform each element in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename Creator, typename Deleter>
template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf:: cudaGraphBase<Creator, Deleter>:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
creates a task to perform parallel transforms over two ranges of items
Template parameters | |
---|---|
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
E | execution policy (default tf::cudaDefaultExecutionPolicy) |
Parameters | |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }