tf::cudaFlow class
#include <taskflow/cuda/cudaflow.hpp>

class to create a cudaFlow task dependency graph

A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1 and task2, where task1 runs before task2.

tf::Taskflow taskflow;
tf::Executor executor;

taskflow.emplace([&](tf::cudaFlow& cf){
  // create two kernel tasks
  tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1);
  tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2);

  // kernel1 runs before kernel2
  task1.precede(task2);
});

executor.run(taskflow).wait();

A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlow will be executed sequentially. Inside a cudaFlow task, different GPU tasks (tf::cudaTask) may run in parallel scheduled by the CUDA runtime.

Please refer to GPU Tasking (cudaFlow) for details.

Constructors, destructors, conversion operators

cudaFlow(): constructs a cudaFlow
~cudaFlow() defaulted: destroys the cudaFlow and its associated native CUDA graph and executable graph
cudaFlow(cudaFlow&&) defaulted: default move constructor

Public functions

auto operator=(cudaFlow&&) -> cudaFlow& defaulted: default move assignment operator
auto empty() const -> bool: queries the emptiness of the graph
auto num_tasks() const -> size_t: queries the number of tasks
void clear(): clears the cudaFlow object
void dump(std::ostream& os) const: dumps the cudaFlow graph into a DOT format through an output stream
void dump_native_graph(std::ostream& os) const: dumps the native CUDA graph into a DOT format through an output stream
auto noop() -> cudaTask: creates a no-operation task
template<typename C> auto host(C&& callable) -> cudaTask: creates a host task that runs a callable on the host
template<typename C> void host(cudaTask task, C&& callable): updates parameters of a host task
template<typename F, typename... ArgsT> auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask: creates a kernel task
template<typename F, typename... ArgsT> void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args): updates parameters of a kernel task
auto memset(void* dst, int v, size_t count) -> cudaTask: creates a memset task that fills untyped data with a byte value
void memset(cudaTask task, void* dst, int ch, size_t count): updates parameters of a memset task
auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask: creates a memcpy task that copies untyped data in bytes
void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes): updates parameters of a memcpy task
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto zero(T* dst, size_t count) -> cudaTask: creates a memset task that sets a typed memory block to zero
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> void zero(cudaTask task, T* dst, size_t count): updates parameters of a memset task to a zero task
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto fill(T* dst, T value, size_t count) -> cudaTask: creates a memset task that fills a typed memory block with a value
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> void fill(cudaTask task, T* dst, T value, size_t count): updates parameters of a memset task to a fill task
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> auto copy(T* tgt, const T* src, size_t num) -> cudaTask: creates a memcpy task that copies typed data
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> void copy(cudaTask task, T* tgt, const T* src, size_t num): updates parameters of a memcpy task to a copy task
void run(cudaStream_t stream): offloads the cudaFlow onto a GPU asynchronously via a stream
auto native_graph() -> cudaGraph_t: acquires a reference to the underlying CUDA graph
auto native_executable() -> cudaGraphExec_t: acquires a reference to the underlying CUDA graph executable
template<typename C> auto single_task(C c) -> cudaTask: runs a callable with only a single kernel thread
template<typename C> void single_task(cudaTask task, C c): updates a single-threaded kernel task
template<typename I, typename C> auto for_each(I first, I last, C callable) -> cudaTask: applies a callable to each dereferenced element of the data array
template<typename I, typename C> void for_each(cudaTask task, I first, I last, C callable): updates parameters of a kernel task created from tf::cudaFlow::for_each
template<typename I, typename C> auto for_each_index(I first, I last, I step, C callable) -> cudaTask: applies a callable to each index in the range with the step size
template<typename I, typename C> void for_each_index(cudaTask task, I first, I last, I step, C callable): updates parameters of a kernel task created from tf::cudaFlow::for_each_index
template<typename I, typename O, typename C> auto transform(I first, I last, O output, C op) -> cudaTask: applies a callable to a source range and stores the result in a target range
template<typename I, typename O, typename C> void transform(cudaTask task, I first, I last, O output, C c): updates parameters of a kernel task created from tf::cudaFlow::transform
template<typename I1, typename I2, typename O, typename C> auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask: creates a task to perform parallel transforms over two ranges of items
template<typename I1, typename I2, typename O, typename C> void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c): updates parameters of a kernel task created from tf::cudaFlow::transform
template<typename C> auto capture(C&& callable) -> cudaTask: constructs a subflow graph through tf::cudaFlowCapturer
template<typename C> void capture(cudaTask task, C callable): updates the captured child graph

Function documentation

void tf::cudaFlow::dump_native_graph(std::ostream& os) const

dumps the native CUDA graph into a DOT format through an output stream

The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved.

cudaTask tf::cudaFlow::noop()

creates a no-operation task

Returns	a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

template<typename C>
cudaTask tf::cudaFlow::host(C&& callable)

creates a host task that runs a callable on the host

Template parameters
C	callable type
Parameters
callable	a callable object with neither arguments nor return (i.e., constructible from `std::function<void()>`)
Returns	a tf::cudaTask handle

A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).

template<typename C>
void tf::cudaFlow::host(cudaTask task, C&& callable)

updates parameters of a host task

The method is similar to tf::cudaFlow::host but operates on a task of type tf::cudaTaskType::HOST.

template<typename F, typename... ArgsT>
cudaTask tf::cudaFlow::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

creates a kernel task

Template parameters
F	kernel function type
ArgsT	kernel function parameters type
Parameters
g	configured grid
b	configured block
s	configured shared memory size in bytes
f	kernel function
args	arguments to forward to the kernel function by copy
Returns	a tf::cudaTask handle

template<typename F, typename... ArgsT>
void tf::cudaFlow::kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)

updates parameters of a kernel task

The method is similar to tf::cudaFlow::kernel but operates on a task of type tf::cudaTaskType::KERNEL. The kernel function name must NOT change.

cudaTask tf::cudaFlow::memset(void* dst, int v, size_t count)

creates a memset task that fills untyped data with a byte value

Parameters
dst	pointer to the destination device memory area
v	value to set for each byte of specified memory
count	size in bytes to set
Returns	a tf::cudaTask handle

A memset task fills the first count bytes of the device memory area pointed to by dst with the byte value v.

void tf::cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count)

updates parameters of a memset task

The method is similar to tf::cudaFlow::memset but operates on a task of type tf::cudaTaskType::MEMSET. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

cudaTask tf::cudaFlow::memcpy(void* tgt, const void* src, size_t bytes)

creates a memcpy task that copies untyped data in bytes

Parameters
tgt	pointer to the target memory block
src	pointer to the source memory block
bytes	bytes to copy
Returns	a tf::cudaTask handle

A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

void tf::cudaFlow::memcpy(cudaTask task, void* tgt, const void* src, size_t bytes)

updates parameters of a memcpy task

The method is similar to tf::cudaFlow::memcpy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaFlow::zero(T* dst, size_t count)

creates a memset task that sets a typed memory block to zero

Template parameters
T	element type (size of `T` must be either 1, 2, or 4)
Parameters
dst	pointer to the destination device memory area
count	number of elements
Returns	a tf::cudaTask handle

A zero task zeroes the first count elements of type T in the device memory area pointed to by dst.

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf::cudaFlow::zero(cudaTask task, T* dst, size_t count)

updates parameters of a memset task to a zero task

The method is similar to tf::cudaFlow::zero but operates on a task of type tf::cudaTaskType::MEMSET.

The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaFlow::fill(T* dst, T value, size_t count)

creates a memset task that fills a typed memory block with a value

Template parameters
T	element type (size of `T` must be either 1, 2, or 4)
Parameters
dst	pointer to the destination device memory area
value	value to fill for each element of type `T`
count	number of elements
Returns	a tf::cudaTask handle

A fill task fills the first count elements of type T with value in the device memory area pointed to by dst. The value to fill is interpreted as type T rather than as a byte.

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf::cudaFlow::fill(cudaTask task, T* dst, T value, size_t count)

updates parameters of a memset task to a fill task

The method is similar to tf::cudaFlow::fill but operates on a task of type tf::cudaTaskType::MEMSET.

The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaFlow::copy(T* tgt, const T* src, size_t num)

creates a memcpy task that copies typed data

Template parameters
T	element type (non-void)
Parameters
tgt	pointer to the target memory block
src	pointer to the source memory block
num	number of elements to copy
Returns	a tf::cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf::cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num)

updates parameters of a memcpy task to a copy task

The method is similar to tf::cudaFlow::copy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

void tf::cudaFlow::run(cudaStream_t stream)

offloads the cudaFlow onto a GPU asynchronously via a stream

Parameters
stream	stream for performing this operation

Offloads the present cudaFlow onto a GPU asynchronously via the given stream.

Offloading a cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology; you may only update node parameters.

template<typename C>
cudaTask tf::cudaFlow::single_task(C c)

runs a callable with only a single kernel thread

Template parameters
C	callable type
Parameters
c	callable to run by a single kernel thread
Returns	a tf::cudaTask handle

template<typename C>
void tf::cudaFlow::single_task(cudaTask task, C c)

updates a single-threaded kernel task

This method is similar to tf::cudaFlow::single_task but operates on an existing task.

template<typename I, typename C>
cudaTask tf::cudaFlow::for_each(I first, I last, C callable)

applies a callable to each dereferenced element of the data array

Template parameters
I	iterator type
C	callable type
Parameters
first	iterator to the beginning (inclusive)
last	iterator to the end (exclusive)
callable	a callable object to apply to the dereferenced iterator
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  callable(*itr);
}

template<typename I, typename C>
void tf::cudaFlow::for_each(cudaTask task, I first, I last, C callable)

updates parameters of a kernel task created from tf::cudaFlow::for_each

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.

template<typename I, typename C>
cudaTask tf::cudaFlow::for_each_index(I first, I last, I step, C callable)

applies a callable to each index in the range with the step size

Template parameters
I	index type
C	callable type
Parameters
first	beginning index (inclusive)
last	ending index (exclusive)
step	step size
callable	the callable to apply to each index in the range
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  callable(i);
}

// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  callable(i);
}

template<typename I, typename C>
void tf::cudaFlow::for_each_index(cudaTask task, I first, I last, I step, C callable)

updates parameters of a kernel task created from tf::cudaFlow::for_each_index

The type of the indices and the callable must be the same as the task created from tf::cudaFlow::for_each_index.

template<typename I, typename O, typename C>
cudaTask tf::cudaFlow::transform(I first, I last, O output, C op)

applies a callable to a source range and stores the result in a target range

Template parameters
I	input iterator type
O	output iterator type
C	unary operator type
Parameters
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	the operator to apply to transform each element in the range
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

template<typename I, typename O, typename C>
void tf::cudaFlow::transform(cudaTask task, I first, I last, O output, C c)

updates parameters of a kernel task created from tf::cudaFlow::transform

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.

template<typename I1, typename I2, typename O, typename C>
cudaTask tf::cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C op)

creates a task to perform parallel transforms over two ranges of items

Template parameters
I1	first input iterator type
I2	second input iterator type
O	output iterator type
C	binary operator type
Parameters
first1	iterator to the beginning of the first input range
last1	iterator to the end of the first input range
first2	iterator to the beginning of the second input range
output	iterator to the beginning of the output range
op	binary operator to apply to transform each pair of items in the two input ranges
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

template<typename I1, typename I2, typename O, typename C>
void tf::cudaFlow::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)

updates parameters of a kernel task created from tf::cudaFlow::transform

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.

template<typename C>
cudaTask tf::cudaFlow::capture(C&& callable)

constructs a subflow graph through tf::cudaFlowCapturer

Template parameters
C	callable type constructible from `std::function<void(tf::cudaFlowCapturer&)>`
Parameters
callable	the callable to construct a capture flow
Returns	a tf::cudaTask handle

A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.

Example usage:

taskflow.emplace([&](tf::cudaFlow& cf){

  tf::cudaTask my_kernel = cf.kernel(my_arguments);

  // create a flow capturer to capture custom kernels
  tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){
    capturer.on([&](cudaStream_t stream){
      invoke_custom_kernel_with_stream(stream, custom_arguments);
    });
  });

  my_kernel.precede(my_subflow);
});

template<typename C>
void tf::cudaFlow::capture(cudaTask task, C callable)

updates the captured child graph

The method is similar to tf::cudaFlow::capture but operates on a task of type tf::cudaTaskType::SUBFLOW. The new captured graph must be topologically identical to the original captured graph.

tf::cudaFlow class #include <taskflow/cuda/cudaflow.hpp>

Constructors, destructors, conversion operators

Public functions

Function documentation

void tf::cudaFlow::dump_native_graph(std::ostream& os) const

cudaTask tf::cudaFlow::noop()

template<typename C> cudaTask tf::cudaFlow::host(C&& callable)

template<typename C> void tf::cudaFlow::host(cudaTask task, C&& callable)

template<typename F, typename... ArgsT> cudaTask tf::cudaFlow::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

template<typename F, typename... ArgsT> void tf::cudaFlow::kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)

cudaTask tf::cudaFlow::memset(void* dst, int v, size_t count)

void tf::cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count)

cudaTask tf::cudaFlow::memcpy(void* tgt, const void* src, size_t bytes)

void tf::cudaFlow::memcpy(cudaTask task, void* tgt, const void* src, size_t bytes)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> cudaTask tf::cudaFlow::zero(T* dst, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> void tf::cudaFlow::zero(cudaTask task, T* dst, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> cudaTask tf::cudaFlow::fill(T* dst, T value, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> void tf::cudaFlow::fill(cudaTask task, T* dst, T value, size_t count)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> cudaTask tf::cudaFlow::copy(T* tgt, const T* src, size_t num)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> void tf::cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num)

void tf::cudaFlow::run(cudaStream_t stream)

template<typename C> cudaTask tf::cudaFlow::single_task(C c)

template<typename C> void tf::cudaFlow::single_task(cudaTask task, C c)

template<typename I, typename C> cudaTask tf::cudaFlow::for_each(I first, I last, C callable)

template<typename I, typename C> void tf::cudaFlow::for_each(cudaTask task, I first, I last, C callable)

template<typename I, typename C> cudaTask tf::cudaFlow::for_each_index(I first, I last, I step, C callable)

template<typename I, typename C> void tf::cudaFlow::for_each_index(cudaTask task, I first, I last, I step, C callable)

template<typename I, typename O, typename C> cudaTask tf::cudaFlow::transform(I first, I last, O output, C op)

template<typename I, typename O, typename C> void tf::cudaFlow::transform(cudaTask task, I first, I last, O output, C c)

template<typename I1, typename I2, typename O, typename C> cudaTask tf::cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C op)

template<typename I1, typename I2, typename O, typename C> void tf::cudaFlow::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)

template<typename C> cudaTask tf::cudaFlow::capture(C&& callable)

template<typename C> void tf::cudaFlow::capture(cudaTask task, C callable)

tf::cudaFlow class
#include <taskflow/cuda/cudaflow.hpp>

template<typename C>
cudaTask tf::cudaFlow::host(C&& callable)

template<typename C>
void tf::cudaFlow::host(cudaTask task, C&& callable)

template<typename F, typename... ArgsT>
cudaTask tf::cudaFlow::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

template<typename F, typename... ArgsT>
void tf::cudaFlow::kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaFlow::zero(T* dst, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf::cudaFlow::zero(cudaTask task, T* dst, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaFlow::fill(T* dst, T value, size_t count)

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf::cudaFlow::fill(cudaTask task, T* dst, T value, size_t count)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaFlow::copy(T* tgt, const T* src, size_t num)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf::cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num)

template<typename C>
cudaTask tf::cudaFlow::single_task(C c)

template<typename C>
void tf::cudaFlow::single_task(cudaTask task, C c)

template<typename I, typename C>
cudaTask tf::cudaFlow::for_each(I first, I last, C callable)

template<typename I, typename C>
void tf::cudaFlow::for_each(cudaTask task, I first, I last, C callable)

template<typename I, typename C>
cudaTask tf::cudaFlow::for_each_index(I first, I last, I step, C callable)

template<typename I, typename C>
void tf::cudaFlow::for_each_index(cudaTask task, I first, I last, I step, C callable)

template<typename I, typename O, typename C>
cudaTask tf::cudaFlow::transform(I first, I last, O output, C op)

template<typename I, typename O, typename C>
void tf::cudaFlow::transform(cudaTask task, I first, I last, O output, C c)

template<typename I1, typename I2, typename O, typename C>
cudaTask tf::cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C op)

template<typename I1, typename I2, typename O, typename C>
void tf::cudaFlow::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)

template<typename C>
cudaTask tf::cudaFlow::capture(C&& callable)

template<typename C>
void tf::cudaFlow::capture(cudaTask task, C callable)