class tf::cudaFlow
#include <taskflow/cuda/cudaflow.hpp>
cudaFlow class to create a cudaFlow task dependency graph
A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launch different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1
and task2
, where task1
runs before task2
.
tf::Taskflow taskflow; tf::Executor executor; taskflow.emplace([&](tf::cudaFlow& cf){ // create two kernel tasks tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1); tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2); // kernel1 runs before kernel2 task1.precede(task2); }); executor.run(taskflow).wait();
A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor.
Please refer to GPU Tasking (cudaFlow) for details.
Constructors, destructors, conversion operators
Public functions
- auto operator=(cudaFlow&&) -> cudaFlow& defaulted
- default move assignment operator
- auto empty() const -> bool
- queries the emptiness of the graph
- auto num_tasks() const -> size_t
- queries the number of tasks
- void clear()
- clears the cudaFlow object
- void dump(std::ostream& os) const
- dumps the cudaFlow graph into a DOT format through an output stream
- void dump_native_graph(std::ostream& os) const
- dumps the native CUDA graph into a DOT format through an output stream
- auto noop() -> cudaTask
- creates a no-operation task
-
template<typename C>auto host(C&& callable) -> cudaTask
- creates a host task that runs a callable on the host
-
template<typename C>void host(cudaTask task, C&& callable)
- updates parameters of a host task
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
- creates a kernel task
-
template<typename F, typename... ArgsT>void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)
- updates parameters of a kernel task
- auto memset(void* dst, int v, size_t count) -> cudaTask
- creates a memset task that fills untyped data with a byte value
- void memset(cudaTask task, void* dst, int ch, size_t count)
- updates parameters of a memset task
- auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
- creates a memcpy task that copies untyped data in bytes
- void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes)
- updates parameters of a memcpy task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto zero(T* dst, size_t count) -> cudaTask
- creates a memset task that sets a typed memory block to zero
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void zero(cudaTask task, T* dst, size_t count)
- updates parameters of a memset task to a zero task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto fill(T* dst, T value, size_t count) -> cudaTask
- creates a memset task that fills a typed memory block with a value
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void fill(cudaTask task, T* dst, T value, size_t count)
- updates parameters of a memset task to a fill task
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- creates a memcopy task that copies typed data
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>void copy(cudaTask task, T* tgt, const T* src, size_t num)
- updates parameters of a memcpy task to a copy task
- void run(cudaStream_t stream)
- offloads the cudaFlow onto a GPU asynchronously via a stream
- auto native_graph() -> cudaGraph_t
- acquires a reference to the underlying CUDA graph
- auto native_executable() -> cudaGraphExec_t
- acquires a reference to the underlying CUDA graph executable
-
template<typename C>auto single_task(C c) -> cudaTask
- runs a callable with only a single kernel thread
-
template<typename C>void single_task(cudaTask task, C c)
- updates a single-threaded kernel task
-
template<typename I, typename C>auto for_each(I first, I last, C callable) -> cudaTask
- applies a callable to each dereferenced element of the data array
-
template<typename I, typename C>void for_each(cudaTask task, I first, I last, C callable)
- updates parameters of a kernel task created from tf::cudaFlow::for_each
-
template<typename I, typename C>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- applies a callable to each index in the range with the step size
-
template<typename I, typename C>void for_each_index(cudaTask task, I first, I last, I step, C callable)
- updates parameters of a kernel task created from tf::cudaFlow::for_each_index
-
template<typename I, typename O, typename C>auto transform(I first, I last, O output, C op) -> cudaTask
- applies a callable to a source range and stores the result in a target range
-
template<typename I, typename O, typename C>void transform(cudaTask task, I first, I last, O output, C c)
- updates parameters of a kernel task created from tf::cudaFlow::transform
-
template<typename I1, typename I2, typename O, typename C>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- creates a task to perform parallel transforms over two ranges of items
-
template<typename I1, typename I2, typename O, typename C>void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)
- updates parameters of a kernel task created from tf::cudaFlow::transform
-
template<typename C>auto capture(C&& callable) -> cudaTask
- constructs a subflow graph through tf::cudaFlowCapturer
-
template<typename C>void capture(cudaTask task, C callable)
- updates the captured child graph
Function documentation
void tf:: cudaFlow:: dump_native_graph(std:: ostream& os) const
dumps the native CUDA graph into a DOT format through an output stream
The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved.
cudaTask tf:: cudaFlow:: noop()
creates a no-operation task
Returns | a tf::cudaTask handle |
---|---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
template<typename C>
cudaTask tf:: cudaFlow:: host(C&& callable)
creates a host task that runs a callable on the host
Template parameters | |
---|---|
C | callable type |
Parameters | |
callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()> ) |
Returns | a tf::cudaTask handle |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc
).
template<typename C>
void tf:: cudaFlow:: host(cudaTask task,
C&& callable)
updates parameters of a host task
The method is similar to tf::cudaFlow::host but operates on an existing host task.
template<typename F, typename... ArgsT>
cudaTask tf:: cudaFlow:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT... args)
creates a kernel task
Template parameters | |
---|---|
F | kernel function type |
ArgsT | kernel function parameters type |
Parameters | |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
Returns | a tf::cudaTask handle |
template<typename F, typename... ArgsT>
void tf:: cudaFlow:: kernel(cudaTask task,
dim3 g,
dim3 b,
size_t shm,
F f,
ArgsT... args)
updates parameters of a kernel task
The method is similar to tf::cudaFlow::kernel but operates on an existing kernel task.
cudaTask tf:: cudaFlow:: memset(void* dst,
int v,
size_t count)
creates a memset task that fills untyped data with a byte value
Parameters | |
---|---|
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
Returns | a tf::cudaTask handle |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
void tf:: cudaFlow:: memset(cudaTask task,
void* dst,
int ch,
size_t count)
updates parameters of a memset task
The method is similar to tf::cudaFlow::memset but operates on an existing memset task.
cudaTask tf:: cudaFlow:: memcpy(void* tgt,
const void* src,
size_t bytes)
creates a memcpy task that copies untyped data in bytes
Parameters | |
---|---|
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
Returns | a tf::cudaTask handle |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
void tf:: cudaFlow:: memcpy(cudaTask task,
void* tgt,
const void* src,
size_t bytes)
updates parameters of a memcpy task
The method is similar to tf::cudaFlow::memcpy but operates on an existing memcpy task.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: zero(T* dst,
size_t count)
creates a memset task that sets a typed memory block to zero
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
count | number of elements |
Returns | a tf::cudaTask handle |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: zero(cudaTask task,
T* dst,
size_t count)
updates parameters of a memset task to a zero task
The method is similar to tf::cudaFlow::zero but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: fill(T* dst,
T value,
size_t count)
creates a memset task that fills a typed memory block with a value
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
Returns | a tf::cudaTask handle |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted in type T
rather than byte.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: fill(cudaTask task,
T* dst,
T value,
size_t count)
updates parameters of a memset task to a fill task
The method is similar to tf::cudaFlow::fill but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaFlow:: copy(T* tgt,
const T* src,
size_t num)
creates a memcopy task that copies typed data
Template parameters | |
---|---|
T | element type (non-void) |
Parameters | |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
Returns | a tf::cudaTask handle |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf:: cudaFlow:: copy(cudaTask task,
T* tgt,
const T* src,
size_t num)
updates parameters of a memcpy task to a copy task
The method is similar to tf::cudaFlow::copy but operates on an existing memcpy task.
void tf:: cudaFlow:: run(cudaStream_t stream)
offloads the cudaFlow onto a GPU asynchronously via a stream
Parameters | |
---|---|
stream | stream for performing this operation |
Offloads the present cudaFlow onto a GPU asynchronously via the given stream.
An offloaded cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology but update node parameters.
template<typename C>
cudaTask tf:: cudaFlow:: single_task(C c)
runs a callable with only a single kernel thread
Template parameters | |
---|---|
C | callable type |
Parameters | |
c | callable to run by a single kernel thread |
Returns | a tf::cudaTask handle |
template<typename C>
void tf:: cudaFlow:: single_task(cudaTask task,
C c)
updates a single-threaded kernel task
This method is similar to cudaFlow::single_task but operates on an existing single-threaded kernel task.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each(I first,
I last,
C callable)
applies a callable to each dereferenced element of the data array
Template parameters | |
---|---|
I | iterator type |
C | callable type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
callable | a callable object to apply to the dereferenced iterator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename I, typename C>
void tf:: cudaFlow:: for_each(cudaTask task,
I first,
I last,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each_index(I first,
I last,
I step,
C callable)
applies a callable to each index in the range with the step size
Template parameters | |
---|---|
I | index type |
C | callable type |
Parameters | |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each element in the data array |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last) for(auto i=first; i<last; i+=step) { callable(i); } // step is negative [first, last) for(auto i=first; i>last; i+=step) { callable(i); }
template<typename I, typename C>
void tf:: cudaFlow:: for_each_index(cudaTask task,
I first,
I last,
I step,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each_index
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index.
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I first,
I last,
O output,
C op)
applies a callable to a source range and stores the result in a target range
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | the operator to apply to transform each element in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = callable(*first++); }
template<typename I, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I first,
I last,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename I1, typename I2, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
creates a task to perform parallel transforms over two ranges of items
Template parameters | |
---|---|
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
template<typename I1, typename I2, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I1 first1,
I1 last1,
I2 first2,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename C>
cudaTask tf:: cudaFlow:: capture(C&& callable)
constructs a subflow graph through tf::cudaFlowCapturer
Template parameters | |
---|---|
C | callable type constructible from std::function<void(tf::cudaFlowCapturer&)> |
Parameters | |
callable | the callable to construct a capture flow |
Returns | a tf::cudaTask handle |
A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.
Example usage:
taskflow.emplace([&](tf::cudaFlow& cf){ tf::cudaTask my_kernel = cf.kernel(my_arguments); // create a flow capturer to capture custom kernels tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ capturer.on([&](cudaStream_t stream){ invoke_custom_kernel_with_stream(stream, custom_arguments); }); }); my_kernel.precede(my_subflow); });
template<typename C>
void tf:: cudaFlow:: capture(cudaTask task,
C callable)
updates the captured child graph
The method is similar to tf::cudaFlow::capture but operates on an existing captured task.