tf::cudaFlowCapturer class
#include <taskflow/cuda/cuda_capturer.hpp>

class to create a cudaFlow graph using stream capture

The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2.

taskflow.emplace([](tf::cudaFlowCapturer& capturer){

  // capture my_kernel_1 through the given stream managed by the capturer
  auto task_1 = capturer.on([&](cudaStream_t stream){
    my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
  });

  // capture my_kernel_2 through the given stream managed by the capturer
  auto task_2 = capturer.on([&](cudaStream_t stream){
    my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
  });

  task_1.precede(task_2);
});

Similar to tf::cudaFlow, a cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlowCapturer will be executed sequentially. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaFlowRoundRobinOptimizer to transform a user-level graph into a native CUDA graph.

Please refer to GPU Tasking (cudaFlowCapturer) for details.

Constructors, destructors, conversion operators

cudaFlowCapturer() defaulted: constructs a standalone cudaFlowCapturer
~cudaFlowCapturer() defaulted: destructs the cudaFlowCapturer
cudaFlowCapturer(cudaFlowCapturer&&) defaulted: default move constructor

Public functions

auto operator=(cudaFlowCapturer&&) -> cudaFlowCapturer& defaulted: default move assignment operator
auto empty() const -> bool: queries the emptiness of the graph
auto num_tasks() const -> size_t: queries the number of tasks
void clear(): clear this cudaFlow capturer
void dump(std::ostream& os) const: dumps the cudaFlow graph into a DOT format through an output stream
void dump_native_graph(std::ostream& os) const: dumps the native captured graph into a DOT format through an output stream
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr> auto on(C&& callable) -> cudaTask: captures a sequential CUDA operations from the given callable
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr> void on(cudaTask task, C&& callable): updates a capture task to another sequential CUDA operations
auto noop() -> cudaTask: captures a no-operation task
void noop(cudaTask task): updates a task to a no-operation task
auto memcpy(void* dst, const void* src, size_t count) -> cudaTask: copies data between host and device asynchronously through a stream
void memcpy(cudaTask task, void* dst, const void* src, size_t count): updates a capture task to a memcpy operation
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> auto copy(T* tgt, const T* src, size_t num) -> cudaTask: captures a copy task of typed data
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> void copy(cudaTask task, T* tgt, const T* src, size_t num): updates a capture task to a copy operation
auto memset(void* ptr, int v, size_t n) -> cudaTask: initializes or sets GPU memory to the given value byte by byte
void memset(cudaTask task, void* ptr, int value, size_t n): updates a capture task to a memset operation
template<typename F, typename... ArgsT> auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args) -> cudaTask: captures a kernel
template<typename F, typename... ArgsT> void kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args): updates a capture task to a kernel operation
template<typename C> auto single_task(C c) -> cudaTask: capturers a kernel to runs the given callable with only one thread
template<typename C> void single_task(cudaTask task, C c): updates a capture task to a single-threaded kernel
template<typename I, typename C> auto for_each(I first, I last, C callable) -> cudaTask: captures a kernel that applies a callable to each dereferenced element of the data array
template<typename I, typename C> void for_each(cudaTask task, I first, I last, C callable): updates a capture task to a for-each kernel task
template<typename I, typename C> auto for_each_index(I first, I last, I step, C callable) -> cudaTask: captures a kernel that applies a callable to each index in the range with the step size
template<typename I, typename C> void for_each_index(cudaTask task, I first, I last, I step, C callable): updates a capture task to a for-each-index kernel task
template<typename I, typename O, typename C> auto transform(I first, I last, O output, C op) -> cudaTask: captures a kernel that transforms an input range to an output range
template<typename I, typename O, typename C> void transform(cudaTask task, I first, I last, O output, C op): updates a capture task to a transform kernel task
template<typename I1, typename I2, typename O, typename C> auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask: captures a kernel that transforms two input ranges to an output range
template<typename I1, typename I2, typename O, typename C> void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op): updates a capture task to a transform kernel task
template<typename OPT, typename... ArgsT> void make_optimizer(ArgsT && ... args): selects a different optimization algorithm
auto capture() -> cudaGraph_t: captures the cudaFlow and turns it into a CUDA Graph
void run(cudaStream_t stream): offloads the cudaFlowCapturer onto a GPU asynchronously via a stream
auto native_graph() -> cudaGraph_t: acquires a reference to the underlying CUDA graph
auto instantiate() -> cudaGraphExec: instantiates an executable graph from this cudaflow capturer

Function documentation

tf::cudaFlowCapturer::cudaFlowCapturer() defaulted

constructs a standalone cudaFlowCapturer

A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using tf::cudaFlowCapturer::run.

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::on(C&& callable)

captures a sequential CUDA operations from the given callable

Template parameters
C	callable type constructible with `std::function<void(cudaStream_t)>`
Parameters
callable	a callable to capture CUDA operations with the stream

This methods applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
void tf::cudaFlowCapturer::on(cudaTask task, C&& callable)

updates a capture task to another sequential CUDA operations

The method is similar to cudaFlowCapturer::on but operates on an existing task.

cudaTask tf::cudaFlowCapturer::noop()

captures a no-operation task

Returns	a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

void tf::cudaFlowCapturer::noop(cudaTask task)

updates a task to a no-operation task

The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.

cudaTask tf::cudaFlowCapturer::memcpy(void* dst, const void* src, size_t count)

copies data between host and device asynchronously through a stream

Parameters
dst	destination memory address
src	source memory address
count	size in bytes to copy

The method captures a cudaMemcpyAsync operation through an internal stream.

void tf::cudaFlowCapturer::memcpy(cudaTask task, void* dst, const void* src, size_t count)

updates a capture task to a memcpy operation

The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::copy(T* tgt, const T* src, size_t num)

captures a copy task of typed data

Template parameters
T	element type (non-void)
Parameters
tgt	pointer to the target memory block
src	pointer to the source memory block
num	number of elements to copy
Returns	cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf::cudaFlowCapturer::copy(cudaTask task, T* tgt, const T* src, size_t num)

updates a capture task to a copy operation

The method is similar to cudaFlowCapturer::copy but operates on an existing task.

cudaTask tf::cudaFlowCapturer::memset(void* ptr, int v, size_t n)

initializes or sets GPU memory to the given value byte by byte

Parameters
ptr	pointer to GPU memory
v	value to set for each byte of the specified memory
n	size in bytes to set

The method captures a cudaMemsetAsync operation through an internal stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

void tf::cudaFlowCapturer::memset(cudaTask task, void* ptr, int value, size_t n)

updates a capture task to a memset operation

The method is similar to cudaFlowCapturer::memset but operates on an existing task.

template<typename F, typename... ArgsT>
cudaTask tf::cudaFlowCapturer::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

captures a kernel

Template parameters
F	kernel function type
ArgsT	kernel function parameters type
Parameters
g	configured grid
b	configured block
s	configured shared memory size in bytes
f	kernel function
args	arguments to forward to the kernel function by copy
Returns	cudaTask handle

template<typename F, typename... ArgsT>
void tf::cudaFlowCapturer::kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

updates a capture task to a kernel operation

The method is similar to cudaFlowCapturer::kernel but operates on an existing task.

template<typename C>
cudaTask tf::cudaFlowCapturer::single_task(C c)

capturers a kernel to runs the given callable with only one thread

Template parameters
C	callable type
Parameters
c	callable to run by a single kernel thread

template<typename C>
void tf::cudaFlowCapturer::single_task(cudaTask task, C c)

updates a capture task to a single-threaded kernel

This method is similar to cudaFlowCapturer::single_task but operates on an existing task.

template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each(I first, I last, C callable)

captures a kernel that applies a callable to each dereferenced element of the data array

Template parameters
I	iterator type
C	callable type
Parameters
first	iterator to the beginning
last	iterator to the end
callable	a callable object to apply to the dereferenced iterator
Returns	cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; i++) {
  callable(*itr);
}

template<typename I, typename C>
void tf::cudaFlowCapturer::for_each(cudaTask task, I first, I last, C callable)

updates a capture task to a for-each kernel task

This method is similar to cudaFlowCapturer::for_each but operates on an existing task.

template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each_index(I first, I last, I step, C callable)

captures a kernel that applies a callable to each index in the range with the step size

Template parameters
I	index type
C	callable type
Parameters
first	beginning index
last	last index
step	step size
callable	the callable to apply to each element in the data array
Returns	cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  callable(i);
}

// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  callable(i);
}

template<typename I, typename C>
void tf::cudaFlowCapturer::for_each_index(cudaTask task, I first, I last, I step, C callable)

updates a capture task to a for-each-index kernel task

This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.

template<typename I, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I first, I last, O output, C op)

captures a kernel that transforms an input range to an output range

Template parameters
I	input iterator type
O	output iterator type
C	unary operator type
Parameters
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	unary operator to apply to transform each item in the range
Returns	cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

template<typename I, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I first, I last, O output, C op)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

template<typename I1, typename I2, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I1 first1, I1 last1, I2 first2, O output, C op)

captures a kernel that transforms two input ranges to an output range

Template parameters
I1	first input iterator type
I2	second input iterator type
O	output iterator type
C	unary operator type
Parameters
first1	iterator to the beginning of the input range
last1	iterator to the end of the input range
first2	iterato
output	iterator to the beginning of the output range
op	binary operator to apply to transform each pair of items in the two input ranges
Returns	cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

template<typename I1, typename I2, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

template<typename OPT, typename... ArgsT>
void tf::cudaFlowCapturer::make_optimizer(ArgsT && ... args)

selects a different optimization algorithm

Template parameters
OPT	optimizer type
ArgsT	arguments types
Parameters
args	arguments to forward to construct the optimizer
Returns	a reference to the optimizer

We currently supports the following optimization algorithms to capture a user-described cudaFlow:

By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph.

void tf::cudaFlowCapturer::run(cudaStream_t stream)

offloads the cudaFlowCapturer onto a GPU asynchronously via a stream

Parameters
stream	stream for performing this operation

Offloads the present cudaFlowCapturer onto a GPU asynchronously via the given stream.

An offloaded cudaFlowCapturer forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology but update node parameters.

tf::cudaFlowCapturer class #include <taskflow/cuda/cuda_capturer.hpp>

Constructors, destructors, conversion operators

Public functions

Function documentation

tf::cudaFlowCapturer::cudaFlowCapturer() defaulted

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr> cudaTask tf::cudaFlowCapturer::on(C&& callable)

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr> void tf::cudaFlowCapturer::on(cudaTask task, C&& callable)

cudaTask tf::cudaFlowCapturer::noop()

void tf::cudaFlowCapturer::noop(cudaTask task)

cudaTask tf::cudaFlowCapturer::memcpy(void* dst, const void* src, size_t count)

void tf::cudaFlowCapturer::memcpy(cudaTask task, void* dst, const void* src, size_t count)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> cudaTask tf::cudaFlowCapturer::copy(T* tgt, const T* src, size_t num)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> void tf::cudaFlowCapturer::copy(cudaTask task, T* tgt, const T* src, size_t num)

cudaTask tf::cudaFlowCapturer::memset(void* ptr, int v, size_t n)

void tf::cudaFlowCapturer::memset(cudaTask task, void* ptr, int value, size_t n)

template<typename F, typename... ArgsT> cudaTask tf::cudaFlowCapturer::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

template<typename F, typename... ArgsT> void tf::cudaFlowCapturer::kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

template<typename C> cudaTask tf::cudaFlowCapturer::single_task(C c)

template<typename C> void tf::cudaFlowCapturer::single_task(cudaTask task, C c)

template<typename I, typename C> cudaTask tf::cudaFlowCapturer::for_each(I first, I last, C callable)

template<typename I, typename C> void tf::cudaFlowCapturer::for_each(cudaTask task, I first, I last, C callable)

template<typename I, typename C> cudaTask tf::cudaFlowCapturer::for_each_index(I first, I last, I step, C callable)

template<typename I, typename C> void tf::cudaFlowCapturer::for_each_index(cudaTask task, I first, I last, I step, C callable)

template<typename I, typename O, typename C> cudaTask tf::cudaFlowCapturer::transform(I first, I last, O output, C op)

template<typename I, typename O, typename C> void tf::cudaFlowCapturer::transform(cudaTask task, I first, I last, O output, C op)

template<typename I1, typename I2, typename O, typename C> cudaTask tf::cudaFlowCapturer::transform(I1 first1, I1 last1, I2 first2, O output, C op)

template<typename I1, typename I2, typename O, typename C> void tf::cudaFlowCapturer::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)

template<typename OPT, typename... ArgsT> void tf::cudaFlowCapturer::make_optimizer(ArgsT && ... args)

void tf::cudaFlowCapturer::run(cudaStream_t stream)

tf::cudaFlowCapturer class
#include <taskflow/cuda/cuda_capturer.hpp>

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::on(C&& callable)

template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
void tf::cudaFlowCapturer::on(cudaTask task, C&& callable)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaFlowCapturer::copy(T* tgt, const T* src, size_t num)

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf::cudaFlowCapturer::copy(cudaTask task, T* tgt, const T* src, size_t num)

template<typename F, typename... ArgsT>
cudaTask tf::cudaFlowCapturer::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

template<typename F, typename... ArgsT>
void tf::cudaFlowCapturer::kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)

template<typename C>
cudaTask tf::cudaFlowCapturer::single_task(C c)

template<typename C>
void tf::cudaFlowCapturer::single_task(cudaTask task, C c)

template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each(I first, I last, C callable)

template<typename I, typename C>
void tf::cudaFlowCapturer::for_each(cudaTask task, I first, I last, C callable)

template<typename I, typename C>
cudaTask tf::cudaFlowCapturer::for_each_index(I first, I last, I step, C callable)

template<typename I, typename C>
void tf::cudaFlowCapturer::for_each_index(cudaTask task, I first, I last, I step, C callable)

template<typename I, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I first, I last, O output, C op)

template<typename I, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I first, I last, O output, C op)

template<typename I1, typename I2, typename O, typename C>
cudaTask tf::cudaFlowCapturer::transform(I1 first1, I1 last1, I2 first2, O output, C op)

template<typename I1, typename I2, typename O, typename C>
void tf::cudaFlowCapturer::transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)

template<typename OPT, typename... ArgsT>
void tf::cudaFlowCapturer::make_optimizer(ArgsT && ... args)