Loading...
Searching...
No Matches
tf::cudaGraphBase< Creator, Deleter > Class Template Reference

class to create a CUDA graph with unique ownership More...

#include <taskflow/cuda/cuda_graph.hpp>

Inheritance diagram for tf::cudaGraphBase< Creator, Deleter >:
[legend]
Collaboration diagram for tf::cudaGraphBase< Creator, Deleter >:
[legend]

Public Types

using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>
 base std::unique_ptr type
 

Public Member Functions

template<typename... ArgsT>
 cudaGraphBase (ArgsT &&... args)
 constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
 
 cudaGraphBase (cudaGraphBase &&)=default
 constructs a cudaGraph from the given rhs using move semantics
 
cudaGraphBase & operator= (cudaGraphBase &&)=default
 assign the rhs to *this using move semantics
 
size_t num_nodes () const
 queries the number of nodes in a native CUDA graph
 
size_t num_edges () const
 queries the number of edges in a native CUDA graph
 
bool empty () const
 queries if the graph is empty
 
void dump (std::ostream &os)
 dumps the CUDA graph to a DOT format through the given output stream
 
cudaTask noop ()
 creates a no-operation task
 
template<typename C >
cudaTask host (C &&callable, void *user_data)
 creates a host task that runs a callable on the host
 
template<typename F , typename... ArgsT>
cudaTask kernel (dim3 g, dim3 b, size_t s, F f, ArgsT... args)
 creates a kernel task
 
cudaTask memset (void *dst, int v, size_t count)
 creates a memset task that fills untyped data with a byte value
 
cudaTask memcpy (void *tgt, const void *src, size_t bytes)
 creates a memcpy task that copies untyped data in bytes
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
cudaTask zero (T *dst, size_t count)
 creates a memset task that sets a typed memory block to zero
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
cudaTask fill (T *dst, T value, size_t count)
 creates a memset task that fills a typed memory block with a value
 
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
cudaTask copy (T *tgt, const T *src, size_t num)
 creates a memcopy task that copies typed data
 
template<typename C >
cudaTask single_task (C c)
 runs a callable with only a single kernel thread
 
template<typename I , typename C , typename E = cudaDefaultExecutionPolicy>
cudaTask for_each (I first, I last, C callable)
 applies a callable to each dereferenced element of the data array
 
template<typename I , typename C , typename E = cudaDefaultExecutionPolicy>
cudaTask for_each_index (I first, I last, I step, C callable)
 applies a callable to each index in the range with the step size
 
template<typename I , typename O , typename C , typename E = cudaDefaultExecutionPolicy>
cudaTask transform (I first, I last, O output, C op)
 applies a callable to a source range and stores the result in a target range
 
template<typename I1 , typename I2 , typename O , typename C , typename E = cudaDefaultExecutionPolicy>
cudaTask transform (I1 first1, I1 last1, I2 first2, O output, C op)
 creates a task to perform parallel transforms over two ranges of items
 

Detailed Description

template<typename Creator, typename Deleter>
class tf::cudaGraphBase< Creator, Deleter >

class to create a CUDA graph with unique ownership

Template Parameters
Creatorfunctor to create the graph (used in constructor)
Deleterfunctor to delete the graph (used in destructor)

This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup.

Constructor & Destructor Documentation

◆ cudaGraphBase()

template<typename Creator , typename Deleter >
template<typename... ArgsT>
tf::cudaGraphBase< Creator, Deleter >::cudaGraphBase ( ArgsT &&... args)
inline explicit

constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator

Constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator

Parameters
argsarguments to pass to the executable CUDA graph creator

Member Function Documentation

◆ copy()

template<typename Creator , typename Deleter >
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >
cudaTask tf::cudaGraphBase< Creator, Deleter >::copy ( T * tgt,
const T * src,
size_t num )

creates a memcopy task that copies typed data

Template Parameters
Telement type (non-void)
Parameters
tgtpointer to the target memory block
srcpointer to the source memory block
numnumber of elements to copy
Returns
a tf::cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ dump()

template<typename Creator , typename Deleter >
void tf::cudaGraphBase< Creator, Deleter >::dump ( std::ostream & os)

dumps the CUDA graph to a DOT format through the given output stream

Parameters
ostarget output stream

◆ fill()

template<typename Creator , typename Deleter >
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
cudaTask tf::cudaGraphBase< Creator, Deleter >::fill ( T * dst,
T value,
size_t count )

creates a memset task that fills a typed memory block with a value

Template Parameters
Telement type (size of T must be either 1, 2, or 4)
Parameters
dstpointer to the destination device memory area
valuevalue to fill for each element of type T
countnumber of elements
Returns
a tf::cudaTask handle

A fill task fills the first count elements of type T with value in a device memory area pointed to by dst. The value to fill is interpreted in type T rather than byte.

◆ for_each()

template<typename Creator , typename Deleter >
template<typename I , typename C , typename E >
cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each ( I first,
I last,
C callable )

applies a callable to each dereferenced element of the data array

Template Parameters
Iiterator type
Ccallable type
Eexecution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
firstiterator to the beginning (inclusive)
lastiterator to the end (exclusive)
callablea callable object to apply to the dereferenced iterator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
callable(*itr);
}

◆ for_each_index()

template<typename Creator , typename Deleter >
template<typename I , typename C , typename E >
cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each_index ( I first,
I last,
I step,
C callable )

applies a callable to each index in the range with the step size

Template Parameters
Iindex type
Ccallable type
Eexecution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
firstbeginning index
lastlast index
stepstep size
callablethe callable to apply to each index in the range
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
callable(i);
}
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
callable(i);
}

◆ host()

template<typename Creator , typename Deleter >
template<typename C >
cudaTask tf::cudaGraphBase< Creator, Deleter >::host ( C && callable,
void * user_data )

creates a host task that runs a callable on the host

Template Parameters
Ccallable type
Parameters
callablea callable object with neither arguments nor return (i.e., constructible from std::function<void()>)
user_dataa pointer to the user data
Returns
a tf::cudaTask handle

A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).

◆ kernel()

template<typename Creator , typename Deleter >
template<typename F , typename... ArgsT>
cudaTask tf::cudaGraphBase< Creator, Deleter >::kernel ( dim3 g,
dim3 b,
size_t s,
F f,
ArgsT... args )

creates a kernel task

Template Parameters
Fkernel function type
ArgsTkernel function parameters type
Parameters
gconfigured grid
bconfigured block
sconfigured shared memory size in bytes
fkernel function
argsarguments to forward to the kernel function by copy
Returns
a tf::cudaTask handle

◆ memcpy()

template<typename Creator , typename Deleter >
cudaTask tf::cudaGraphBase< Creator, Deleter >::memcpy ( void * tgt,
const void * src,
size_t bytes )

creates a memcpy task that copies untyped data in bytes

Parameters
tgtpointer to the target memory block
srcpointer to the source memory block
bytesbytes to copy
Returns
a tf::cudaTask handle

A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ memset()

template<typename Creator , typename Deleter >
cudaTask tf::cudaGraphBase< Creator, Deleter >::memset ( void * dst,
int v,
size_t count )

creates a memset task that fills untyped data with a byte value

Parameters
dstpointer to the destination device memory area
vvalue to set for each byte of specified memory
countsize in bytes to set
Returns
a tf::cudaTask handle

A memset task fills the first count bytes of the device memory area pointed to by dst with the byte value v.

◆ noop()

template<typename Creator , typename Deleter >
cudaTask tf::cudaGraphBase< Creator, Deleter >::noop ( )

creates a no-operation task

Returns
a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

◆ single_task()

template<typename Creator , typename Deleter >
template<typename C >
cudaTask tf::cudaGraphBase< Creator, Deleter >::single_task ( C c)

runs a callable with only a single kernel thread

Template Parameters
Ccallable type
Parameters
ccallable to run by a single kernel thread
Returns
a tf::cudaTask handle

◆ transform() [1/2]

template<typename Creator , typename Deleter >
template<typename I , typename O , typename C , typename E >
cudaTask tf::cudaGraphBase< Creator, Deleter >::transform ( I first,
I last,
O output,
C op )

applies a callable to a source range and stores the result in a target range

Template Parameters
Iinput iterator type
Ooutput iterator type
Cunary operator type
Eexecution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
opthe operator to apply to transform each element in the range
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*output++ = op(*first++);
}

◆ transform() [2/2]

template<typename Creator , typename Deleter >
template<typename I1 , typename I2 , typename O , typename C , typename E >
cudaTask tf::cudaGraphBase< Creator, Deleter >::transform ( I1 first1,
I1 last1,
I2 first2,
O output,
C op )

creates a task to perform parallel transforms over two ranges of items

Template Parameters
I1first input iterator type
I2second input iterator type
Ooutput iterator type
Cbinary operator type
Eexecution policy (default tf::cudaDefaultExecutionPolicy)
Parameters
first1iterator to the beginning of the input range
last1iterator to the end of the input range
first2iterator to the beginning of the second input range
outputiterator to the beginning of the output range
opbinary operator to apply to transform each pair of items in the two input ranges
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
*output++ = op(*first1++, *first2++);
}

◆ zero()

template<typename Creator , typename Deleter >
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
cudaTask tf::cudaGraphBase< Creator, Deleter >::zero ( T * dst,
size_t count )

creates a memset task that sets a typed memory block to zero

Template Parameters
Telement type (size of T must be either 1, 2, or 4)
Parameters
dstpointer to the destination device memory area
countnumber of elements
Returns
a tf::cudaTask handle

A zero task zeroes the first count elements of type T in a device memory area pointed to by dst.


The documentation for this class was generated from the following files: