template<typename Creator, typename Deleter>
tf::cudaGraphBase class

class to create a CUDA graph managed by a C++ smart pointer

Template parameters
Creator functor to create the graph (used in constructor)
Deleter functor to delete the graph (used in destructor)

This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup.

Derived classes

class cudaFlow
class to create a cudaFlow task dependency graph

Public types

using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>
base std::unique_ptr type

Constructors, destructors, conversion operators

template<typename... ArgsT>
cudaGraphBase(ArgsT && ... args) explicit
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator
operator cudaGraph_t() const noexcept
implicit conversion to the underlying cudaGraph_t object

Public functions

auto num_nodes() const -> size_t
queries the number of nodes in a native CUDA graph
auto num_edges() const -> size_t
queries the number of edges in a native CUDA graph
auto empty() const -> bool
queries if the graph is empty
void dump(std::ostream& os)
dumps the CUDA graph in DOT format through the given output stream
auto noop() -> cudaTask
creates a no-operation task
template<typename C>
auto host(C&& callable, void* user_data) -> cudaTask
creates a host task that runs a callable on the host
template<typename F, typename... ArgsT>
auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
creates a kernel task
auto memset(void* dst, int v, size_t count) -> cudaTask
creates a memset task that fills untyped data with a byte value
auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
creates a memcpy task that copies untyped data in bytes
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto zero(T* dst, size_t count) -> cudaTask
creates a memset task that sets a typed memory block to zero
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto fill(T* dst, T value, size_t count) -> cudaTask
creates a memset task that fills a typed memory block with a value
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
auto copy(T* tgt, const T* src, size_t num) -> cudaTask
creates a memcpy task that copies typed data

Function documentation

template<typename Creator, typename Deleter> template<typename... ArgsT>
tf::cudaGraphBase<Creator, Deleter>::cudaGraphBase(ArgsT && ... args) explicit

constructs a cudaGraph object by passing the given arguments to the CUDA graph creator

Parameters
args arguments to pass to the CUDA graph creator

Constructs a cudaGraph object by passing the given arguments to the CUDA graph creator.

template<typename Creator, typename Deleter>
tf::cudaGraphBase<Creator, Deleter>::operator cudaGraph_t() const noexcept

implicit conversion to the underlying cudaGraph_t object

Returns the underlying cudaGraph_t object, equivalently calling base_type::get().

template<typename Creator, typename Deleter>
void tf::cudaGraphBase<Creator, Deleter>::dump(std::ostream& os)

dumps the CUDA graph in DOT format through the given output stream

Parameters
os target output stream

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::noop()

creates a no-operation task

Returns a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

template<typename Creator, typename Deleter> template<typename C>
cudaTask tf::cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data)

creates a host task that runs a callable on the host

Template parameters
C callable type
Parameters
callable a callable object that takes no arguments and returns nothing (i.e., one from which a std::function<void()> can be constructed)
user_data a pointer to the user data
Returns a tf::cudaTask handle

A host task can only execute CPU-specific functions and cannot make any CUDA calls (e.g., cudaMalloc).

template<typename Creator, typename Deleter> template<typename F, typename... ArgsT>
cudaTask tf::cudaGraphBase<Creator, Deleter>::kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

creates a kernel task

Template parameters
F kernel function type
ArgsT kernel function parameter types
Parameters
g configured grid
b configured block
s configured shared memory size in bytes
f kernel function
args arguments to forward to the kernel function by copy
Returns a tf::cudaTask handle

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memset(void* dst, int v, size_t count)

creates a memset task that fills untyped data with a byte value

Parameters
dst pointer to the destination device memory area
v value to set for each byte of specified memory
count size in bytes to set
Returns a tf::cudaTask handle

A memset task fills the first count bytes of the device memory area pointed to by dst with the byte value v.

template<typename Creator, typename Deleter>
cudaTask tf::cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes)

creates a memcpy task that copies untyped data in bytes

Parameters
tgt pointer to the target memory block
src pointer to the source memory block
bytes bytes to copy
Returns a tf::cudaTask handle

A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count)

creates a memset task that sets a typed memory block to zero

Template parameters
T element type (size of T must be either 1, 2, or 4)
Parameters
dst pointer to the destination device memory area
count number of elements
Returns a tf::cudaTask handle

A zero task zeroes the first count elements of type T in the device memory area pointed to by dst.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count)

creates a memset task that fills a typed memory block with a value

Template parameters
T element type (size of T must be either 1, 2, or 4)
Parameters
dst pointer to the destination device memory area
value value to fill for each element of type T
count number of elements
Returns a tf::cudaTask handle

A fill task fills the first count elements of type T with value in the device memory area pointed to by dst. The value to fill is interpreted as type T rather than as a byte.

template<typename Creator, typename Deleter> template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num)

creates a memcpy task that copies typed data

Template parameters
T element type (non-void)
Parameters
tgt pointer to the target memory block
src pointer to the source memory block
num number of elements to copy
Returns a tf::cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.