tf::cudaFlow class
#include <taskflow/cuda/cudaflow.hpp>
A class to create a cudaFlow task dependency graph
A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1
and task2
, where task1
runs before task2
.
tf::cudaStream stream; tf::cudaFlow cf; // create two kernel tasks tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1); tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2); // kernel1 runs before kernel2 task1.precede(task2); // create an executable graph from the cudaflow tf::cudaGraphExec exec = cf.instantiate(); // run the executable graph through the given stream exec.run(stream);
Please refer to GPU Tasking (cudaFlow) for details.
Constructors, destructors, conversion operators
Public functions
- auto noop() -> cudaTask
- creates a no-operation task
-
template<typename C>auto host(C&& callable, void* user_data) -> cudaTask
- creates a host task that runs a callable on the host
-
template<typename C>void host(cudaTask task, C&& callable)
- updates parameters of a host task
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
- creates a kernel task
-
template<typename F, typename... ArgsT>void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)
- updates parameters of a kernel task
- auto memset(void* dst, int v, size_t count) -> cudaTask
- creates a memset task that fills untyped data with a byte value
- void memset(cudaTask task, void* dst, int ch, size_t count)
- updates parameters of a memset task
- auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
- creates a memcpy task that copies untyped data in bytes
- void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes)
- updates parameters of a memcpy task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto zero(T* dst, size_t count) -> cudaTask
- creates a memset task that sets a typed memory block to zero
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void zero(cudaTask task, T* dst, size_t count)
- updates parameters of a memset task to a zero task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto fill(T* dst, T value, size_t count) -> cudaTask
- creates a memset task that fills a typed memory block with a value
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void fill(cudaTask task, T* dst, T value, size_t count)
- updates parameters of a memset task to a fill task
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- creates a memcopy task that copies typed data
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>void copy(cudaTask task, T* tgt, const T* src, size_t num)
- updates parameters of a memcpy task to a copy task
- auto instantiate() -> cudaGraphExec
- instantiates an executable graph from this cudaflow
-
template<typename C>auto single_task(C c) -> cudaTask
- runs a callable with only a single kernel thread
-
template<typename C>void single_task(cudaTask task, C c)
- updates a single-threaded kernel task
-
template<typename I, typename C>auto for_each(I first, I last, C callable) -> cudaTask
- applies a callable to each dereferenced element of the data array
-
template<typename I, typename C>void for_each(cudaTask task, I first, I last, C callable)
- updates parameters of a kernel task created from tf::cudaFlow::for_each
-
template<typename I, typename C>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- applies a callable to each index in the range with the step size
-
template<typename I, typename C>void for_each_index(cudaTask task, I first, I last, I step, C callable)
- updates parameters of a kernel task created from tf::cudaFlow::for_each_index
-
template<typename I, typename O, typename C>auto transform(I first, I last, O output, C op) -> cudaTask
- applies a callable to a source range and stores the result in a target range
-
template<typename I, typename O, typename C>void transform(cudaTask task, I first, I last, O output, C c)
- updates parameters of a kernel task created from tf::cudaFlow::transform
-
template<typename I1, typename I2, typename O, typename C>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- creates a task to perform parallel transforms over two ranges of items
-
template<typename I1, typename I2, typename O, typename C>void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)
- updates parameters of a kernel task created from tf::cudaFlow::transform
-
template<typename C>auto capture(C&& callable) -> cudaTask
- constructs a subflow graph through tf::cudaFlowCapturer
-
template<typename C>void capture(cudaTask task, C callable)
- updates the captured child graph
Function documentation
cudaTask tf:: cudaFlow:: noop()
creates a no-operation task
Returns | a tf::cudaTask handle |
---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
template<typename C>
cudaTask tf:: cudaFlow:: host(C&& callable,
void* user_data)
creates a host task that runs a callable on the host
Template parameters | |
---|---|
C | callable type |
Parameters | |
callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()> ) |
user_data | a pointer to the user data |
Returns | a tf::cudaTask handle |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc
).
template<typename C>
void tf:: cudaFlow:: host(cudaTask task,
C&& callable)
updates parameters of a host task
The method is similar to tf::cudaFlow::host but operates on an existing host task.
template<typename F, typename... ArgsT>
cudaTask tf:: cudaFlow:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT... args)
creates a kernel task
Template parameters | |
---|---|
F | kernel function type |
ArgsT | kernel function parameters type |
Parameters | |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
Returns | a tf::cudaTask handle |
template<typename F, typename... ArgsT>
void tf:: cudaFlow:: kernel(cudaTask task,
dim3 g,
dim3 b,
size_t shm,
F f,
ArgsT... args)
updates parameters of a kernel task
The method is similar to tf::cudaFlow::kernel but operates on an existing kernel task.
cudaTask tf:: cudaFlow:: memset(void* dst,
int v,
size_t count)
creates a memset task that fills untyped data with a byte value
Parameters | |
---|---|
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
Returns | a tf::cudaTask handle |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
void tf:: cudaFlow:: memset(cudaTask task,
void* dst,
int ch,
size_t count)
updates parameters of a memset task
The method is similar to tf::cudaFlow::memset but operates on an existing memset task.
cudaTask tf:: cudaFlow:: memcpy(void* tgt,
const void* src,
size_t bytes)
creates a memcpy task that copies untyped data in bytes
Parameters | |
---|---|
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
Returns | a tf::cudaTask handle |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
void tf:: cudaFlow:: memcpy(cudaTask task,
void* tgt,
const void* src,
size_t bytes)
updates parameters of a memcpy task
The method is similar to tf::cudaFlow::memcpy but operates on an existing memcpy task.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: zero(T* dst,
size_t count)
creates a memset task that sets a typed memory block to zero
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
count | number of elements |
Returns | a tf::cudaTask handle |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: zero(cudaTask task,
T* dst,
size_t count)
updates parameters of a memset task to a zero task
The method is similar to tf::cudaFlow::zero but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: fill(T* dst,
T value,
size_t count)
creates a memset task that fills a typed memory block with a value
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
Returns | a tf::cudaTask handle |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted in type T
rather than byte.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: fill(cudaTask task,
T* dst,
T value,
size_t count)
updates parameters of a memset task to a fill task
The method is similar to tf::cudaFlow::fill but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaFlow:: copy(T* tgt,
const T* src,
size_t num)
creates a memcopy task that copies typed data
Template parameters | |
---|---|
T | element type (non-void) |
Parameters | |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
Returns | a tf::cudaTask handle |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf:: cudaFlow:: copy(cudaTask task,
T* tgt,
const T* src,
size_t num)
updates parameters of a memcpy task to a copy task
The method is similar to tf::cudaFlow::copy but operates on an existing memcpy task.
template<typename C>
cudaTask tf:: cudaFlow:: single_task(C c)
runs a callable with only a single kernel thread
Template parameters | |
---|---|
C | callable type |
Parameters | |
c | callable to run by a single kernel thread |
Returns | a tf::cudaTask handle |
template<typename C>
void tf:: cudaFlow:: single_task(cudaTask task,
C c)
updates a single-threaded kernel task
This method is similar to cudaFlow::single_task but operates on an existing single-threaded kernel task.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each(I first,
I last,
C callable)
applies a callable to each dereferenced element of the data array
Template parameters | |
---|---|
I | iterator type |
C | callable type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
callable | a callable object to apply to the dereferenced iterator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename I, typename C>
void tf:: cudaFlow:: for_each(cudaTask task,
I first,
I last,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each_index(I first,
I last,
I step,
C callable)
applies a callable to each index in the range with the step size
Template parameters | |
---|---|
I | index type |
C | callable type |
Parameters | |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each element in the data array |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last) for(auto i=first; i<last; i+=step) { callable(i); } // step is negative [first, last) for(auto i=first; i>last; i+=step) { callable(i); }
template<typename I, typename C>
void tf:: cudaFlow:: for_each_index(cudaTask task,
I first,
I last,
I step,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each_index
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index.
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I first,
I last,
O output,
C op)
applies a callable to a source range and stores the result in a target range
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | the operator to apply to transform each element in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = callable(*first++); }
template<typename I, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I first,
I last,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename I1, typename I2, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
creates a task to perform parallel transforms over two ranges of items
Template parameters | |
---|---|
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
template<typename I1, typename I2, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I1 first1,
I1 last1,
I2 first2,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename C>
cudaTask tf:: cudaFlow:: capture(C&& callable)
constructs a subflow graph through tf::cudaFlowCapturer
Template parameters | |
---|---|
C | callable type constructible from std::function<void(tf::cudaFlowCapturer&)> |
Parameters | |
callable | the callable to construct a capture flow |
Returns | a tf::cudaTask handle |
A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.
Example usage:
taskflow.emplace([&](tf::cudaFlow& cf){ tf::cudaTask my_kernel = cf.kernel(my_arguments); // create a flow capturer to capture custom kernels tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ capturer.on([&](cudaStream_t stream){ invoke_custom_kernel_with_stream(stream, custom_arguments); }); }); my_kernel.precede(my_subflow); });
template<typename C>
void tf:: cudaFlow:: capture(cudaTask task,
C callable)
updates the captured child graph
The method is similar to tf::cudaFlow::capture but operates on an existing captured task.