template<typename Creator, typename Deleter>
tf::cudaGraphBase class

class to create a CUDA graph with unique ownership

Template parameters
Creator	functor to create the graph (used in constructor)
Deleter	functor to delete the graph (used in destructor)

This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup.

Public types

using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>: base std::unique_ptr type

Constructors, destructors, conversion operators

template<typename... ArgsT> cudaGraphBase(ArgsT && ... args) explicit: constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
cudaGraphBase(cudaGraphBase&&) defaulted: constructs a cudaGraph from the given rhs using move semantics

Public functions

auto operator=(cudaGraphBase&&) -> cudaGraphBase& defaulted: assign the rhs to *this using move semantics
auto num_nodes() const -> size_t: queries the number of nodes in a native CUDA graph
auto num_edges() const -> size_t: queries the number of edges in a native CUDA graph
auto empty() const -> bool: queries if the graph is empty
void dump(std::ostream& os): dumps the CUDA graph to a DOT format through the given output stream
auto noop() -> cudaTask: creates a no-operation task
template<typename C> auto host(C&& callable, void* user_data) -> cudaTask: creates a host task that runs a callable on the host
template<typename F, typename... ArgsT> auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask: creates a kernel task
auto memset(void* dst, int v, size_t count) -> cudaTask: creates a memset task that fills untyped data with a byte value
auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask: creates a memcpy task that copies untyped data in bytes
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto zero(T* dst, size_t count) -> cudaTask: creates a memset task that sets a typed memory block to zero
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto fill(T* dst, T value, size_t count) -> cudaTask: creates a memset task that fills a typed memory block with a value
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> auto copy(T* tgt, const T* src, size_t num) -> cudaTask: creates a memcopy task that copies typed data
template<typename C> auto single_task(C c) -> cudaTask: runs a callable with only a single kernel thread
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy> auto for_each(I first, I last, C callable) -> cudaTask: applies a callable to each dereferenced element of the data array
template<typename I, typename C, typename E = cudaDefaultExecutionPolicy> auto for_each_index(I first, I last, I step, C callable) -> cudaTask: applies a callable to each index in the range with the step size
template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy> auto transform(I first, I last, O output, C op) -> cudaTask: applies a callable to a source range and stores the result in a target range
template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy> auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask: creates a task to perform parallel transforms over two ranges of items

Function documentation

template<typename Creator, typename Deleter> template<typename... ArgsT>
tf::cudaGraphBase<Creator, Deleter>::cudaGraphBase(ArgsT && ... args) explicit

constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator

Parameters
args	arguments to pass to the executable CUDA graph creator

Constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator

template<typename Creator, typename Deleter>
void tf::cudaGraphBase<Creator, Deleter>::dump(std::ostream& os)

dumps the CUDA graph to a DOT format through the given output stream

Parameters
os	target output stream

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::noop()

creates a no-operation task

Returns	a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

template<typename Creator, typename Deleter> template<typename C>
cudaTask tf::cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data)

creates a host task that runs a callable on the host

Template parameters
C	callable type
Parameters
callable	a callable object with neither arguments nor return (i.e., constructible from `std::function<void()>`)
user_data	a pointer to the user data
Returns	a tf::cudaTask handle

A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).

template<typename Creator, typename Deleter> template<typename F, typename... ArgsT>
cudaTask tf::cudaGraphBase<Creator, Deleter>::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

creates a kernel task

Template parameters
F	kernel function type
ArgsT	kernel function parameters type
Parameters
g	configured grid
b	configured block
s	configured shared memory size in bytes
f	kernel function
args	arguments to forward to the kernel function by copy
Returns	a tf::cudaTask handle

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memset(void* dst, int v, size_t count)

creates a memset task that fills untyped data with a byte value

Parameters
dst	pointer to the destination device memory area
v	value to set for each byte of specified memory
count	size in bytes to set
Returns	a tf::cudaTask handle

A memset task fills the first count bytes of the device memory area pointed to by dst with the byte value v.

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes)

creates a memcpy task that copies untyped data in bytes

Parameters
tgt	pointer to the target memory block
src	pointer to the source memory block
bytes	bytes to copy
Returns	a tf::cudaTask handle

A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count)

creates a memset task that sets a typed memory block to zero

Template parameters
T	element type (size of `T` must be either 1, 2, or 4)
Parameters
dst	pointer to the destination device memory area
count	number of elements
Returns	a tf::cudaTask handle

A zero task zeroes the first count elements of type T in the device memory area pointed to by dst.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count)

creates a memset task that fills a typed memory block with a value

Template parameters
T	element type (size of `T` must be either 1, 2, or 4)
Parameters
dst	pointer to the destination device memory area
value	value to fill for each element of type `T`
count	number of elements
Returns	a tf::cudaTask handle

A fill task fills the first count elements of type T with value in the device memory area pointed to by dst. The value to fill is interpreted as type T rather than as a byte.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num)

creates a memcopy task that copies typed data

Template parameters
T	element type (non-void)
Parameters
tgt	pointer to the target memory block
src	pointer to the source memory block
num	number of elements to copy
Returns	a tf::cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

template<typename Creator, typename Deleter> template<typename C>
cudaTask tf::cudaGraphBase<Creator, Deleter>::single_task(C c)

runs a callable with only a single kernel thread

Template parameters
C	callable type
Parameters
c	callable to run by a single kernel thread
Returns	a tf::cudaTask handle

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each(I first, I last, C callable)

applies a callable to each dereferenced element of the data array

Template parameters
I	iterator type
C	callable type
E	execution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
first	iterator to the beginning (inclusive)
last	iterator to the end (exclusive)
callable	a callable object to apply to the dereferenced iterator
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  callable(*itr);
}

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each_index(I first, I last, I step, C callable)

applies a callable to each index in the range with the step size

Template parameters
I	index type
C	callable type
E	execution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
first	beginning index
last	last index
step	step size
callable	the callable to apply to each index in the range
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  callable(i);
}

// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  callable(i);
}

template<typename Creator, typename Deleter> template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I first, I last, O output, C op)

applies a callable to a source range and stores the result in a target range

Template parameters
I	input iterator type
O	output iterator type
C	unary operator type
E	execution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	the operator to apply to transform each element in the range
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

template<typename Creator, typename Deleter> template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I1 first1, I1 last1, I2 first2, O output, C op)

creates a task to perform parallel transforms over two ranges of items

Template parameters
I1	first input iterator type
I2	second input iterator type
O	output iterator type
C	binary operator type
E	execution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
first1	iterator to the beginning of the input range
last1	iterator to the end of the input range
first2	iterator to the beginning of the second input range
output	iterator to the beginning of the output range
op	binary operator to apply to transform each pair of items in the two input ranges
Returns	a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

template<typename Creator, typename Deleter> tf::cudaGraphBase class

Public types

Constructors, destructors, conversion operators

Public functions

Function documentation

template<typename Creator, typename Deleter> template<typename... ArgsT> tf::cudaGraphBase<Creator, Deleter>::cudaGraphBase(ArgsT && ... args) explicit

template<typename Creator, typename Deleter> void tf::cudaGraphBase<Creator, Deleter>::dump(std::ostream& os)

template<typename Creator, typename Deleter> cudaTask tf::cudaGraphBase<Creator, Deleter>::noop()

template<typename Creator, typename Deleter> template<typename C> cudaTask tf::cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data)

template<typename Creator, typename Deleter> template<typename F, typename... ArgsT> cudaTask tf::cudaGraphBase<Creator, Deleter>::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

template<typename Creator, typename Deleter> cudaTask tf::cudaGraphBase<Creator, Deleter>::memset(void* dst, int v, size_t count)

template<typename Creator, typename Deleter> cudaTask tf::cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> cudaTask tf::cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> cudaTask tf::cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> cudaTask tf::cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num)

template<typename Creator, typename Deleter> template<typename C> cudaTask tf::cudaGraphBase<Creator, Deleter>::single_task(C c)

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy> cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each(I first, I last, C callable)

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy> cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each_index(I first, I last, I step, C callable)

template<typename Creator, typename Deleter> template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy> cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I first, I last, O output, C op)

template<typename Creator, typename Deleter> template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy> cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I1 first1, I1 last1, I2 first2, O output, C op)

template<typename Creator, typename Deleter>
tf::cudaGraphBase class

template<typename Creator, typename Deleter> template<typename... ArgsT>
tf::cudaGraphBase<Creator, Deleter>::cudaGraphBase(ArgsT && ... args) explicit

template<typename Creator, typename Deleter>
void tf::cudaGraphBase<Creator, Deleter>::dump(std::ostream& os)

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::noop()

template<typename Creator, typename Deleter> template<typename C>
cudaTask tf::cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data)

template<typename Creator, typename Deleter> template<typename F, typename... ArgsT>
cudaTask tf::cudaGraphBase<Creator, Deleter>::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memset(void* dst, int v, size_t count)

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count)

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num)

template<typename Creator, typename Deleter> template<typename C>
cudaTask tf::cudaGraphBase<Creator, Deleter>::single_task(C c)

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each(I first, I last, C callable)

template<typename Creator, typename Deleter> template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::for_each_index(I first, I last, I step, C callable)

template<typename Creator, typename Deleter> template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I first, I last, O output, C op)

template<typename Creator, typename Deleter> template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
cudaTask tf::cudaGraphBase<Creator, Deleter>::transform(I1 first1, I1 last1, I2 first2, O output, C op)