class to create a CUDA graph with unique ownership More...
#include <taskflow/cuda/cuda_graph.hpp>
Public Types | |
| using | base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter> |
| base std::unique_ptr type | |
Public Member Functions | |
| template<typename... ArgsT> | |
| cudaGraphBase (ArgsT &&... args) | |
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator | |
| cudaGraphBase (cudaGraphBase &&)=default | |
constructs a cudaGraph from the given rhs using move semantics | |
| cudaGraphBase & | operator= (cudaGraphBase &&)=default |
assign the rhs to *this using move semantics | |
| size_t | num_nodes () const |
| queries the number of nodes in a native CUDA graph | |
| size_t | num_edges () const |
| queries the number of edges in a native CUDA graph | |
| bool | empty () const |
| queries if the graph is empty | |
| void | dump (std::ostream &os) |
| dumps the CUDA graph to a DOT format through the given output stream | |
| cudaTask | noop () |
| creates a no-operation task | |
| template<typename C > | |
| cudaTask | host (C &&callable, void *user_data) |
| creates a host task that runs a callable on the host | |
| template<typename F , typename... ArgsT> | |
| cudaTask | kernel (dim3 g, dim3 b, size_t s, F f, ArgsT... args) |
| creates a kernel task | |
| cudaTask | memset (void *dst, int v, size_t count) |
| creates a memset task that fills untyped data with a byte value | |
| cudaTask | memcpy (void *tgt, const void *src, size_t bytes) |
| creates a memcpy task that copies untyped data in bytes | |
| template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
| cudaTask | zero (T *dst, size_t count) |
| creates a memset task that sets a typed memory block to zero | |
| template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
| cudaTask | fill (T *dst, T value, size_t count) |
| creates a memset task that fills a typed memory block with a value | |
| template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr> | |
| cudaTask | copy (T *tgt, const T *src, size_t num) |
| creates a memcopy task that copies typed data | |
| template<typename C > | |
| cudaTask | single_task (C c) |
| runs a callable with only a single kernel thread | |
| template<typename I , typename C , typename E = cudaDefaultExecutionPolicy> | |
| cudaTask | for_each (I first, I last, C callable) |
| applies a callable to each dereferenced element of the data array | |
| template<typename I , typename C , typename E = cudaDefaultExecutionPolicy> | |
| cudaTask | for_each_index (I first, I last, I step, C callable) |
| applies a callable to each index in the range with the step size | |
| template<typename I , typename O , typename C , typename E = cudaDefaultExecutionPolicy> | |
| cudaTask | transform (I first, I last, O output, C op) |
| applies a callable to a source range and stores the result in a target range | |
| template<typename I1 , typename I2 , typename O , typename C , typename E = cudaDefaultExecutionPolicy> | |
| cudaTask | transform (I1 first1, I1 last1, I2 first2, O output, C op) |
| creates a task to perform parallel transforms over two ranges of items | |
class to create a CUDA graph with unique ownership
| Creator | functor to create the graph (used in constructor) |
| Deleter | functor to delete the graph (used in destructor) |
This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup.
|
inline explicit |
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator
Constructs a cudaGraph object by passing the given arguments to the CUDA graph creator
| args | arguments to pass to the CUDA graph creator |
| cudaTask tf::cudaGraphBase< Creator, Deleter >::copy | ( | T * | tgt, |
| const T * | src, | ||
| size_t | num ) |
creates a memcopy task that copies typed data
| T | element type (non-void) |
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| num | number of elements to copy |
A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
| void tf::cudaGraphBase< Creator, Deleter >::dump | ( | std::ostream & | os | ) |
dumps the CUDA graph to a DOT format through the given output stream
| os | target output stream |
| cudaTask tf::cudaGraphBase< Creator, Deleter >::fill | ( | T * | dst, |
| T | value, | ||
| size_t | count ) |
creates a memset task that fills a typed memory block with a value
| T | element type (size of T must be either 1, 2, or 4) |
| dst | pointer to the destination device memory area |
| value | value to fill for each element of type T |
| count | number of elements |
A fill task fills the first count elements of type T with value in a device memory area pointed by dst. The value to fill is interpreted in type T rather than byte.
| cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each | ( | I | first, |
| I | last, | ||
| C | callable ) |
applies a callable to each dereferenced element of the data array
| I | iterator type |
| C | callable type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| first | iterator to the beginning (inclusive) |
| last | iterator to the end (exclusive) |
| callable | a callable object to apply to the dereferenced iterator |
This method is equivalent to the parallel execution of the following loop on a GPU:
    for(auto itr = first; itr != last; itr++) {
      callable(*itr);
    }
| cudaTask tf::cudaGraphBase< Creator, Deleter >::for_each_index | ( | I | first, |
| I | last, | ||
| I | step, | ||
| C | callable ) |
applies a callable to each index in the range with the step size
| I | index type |
| C | callable type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| first | beginning index |
| last | last index |
| step | step size |
| callable | the callable to apply to each index in the range |
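This method is equivalent to the parallel execution of the following loop on a GPU:
    // step is positive: [first, last)
    for(auto i = first; i < last; i += step) {
      callable(i);
    }
    // step is negative: [first, last)
    for(auto i = first; i > last; i += step) {
      callable(i);
    }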
| cudaTask tf::cudaGraphBase< Creator, Deleter >::host | ( | C && | callable, |
| void * | user_data ) |
creates a host task that runs a callable on the host
| C | callable type |
| callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()>) |
| user_data | a pointer to the user data |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).
| cudaTask tf::cudaGraphBase< Creator, Deleter >::kernel | ( | dim3 | g, |
| dim3 | b, | ||
| size_t | s, | ||
| F | f, | ||
| ArgsT... | args ) |
creates a kernel task
| F | kernel function type |
| ArgsT | kernel function parameters type |
| g | configured grid |
| b | configured block |
| s | configured shared memory size in bytes |
| f | kernel function |
| args | arguments to forward to the kernel function by copy |
| cudaTask tf::cudaGraphBase< Creator, Deleter >::memcpy | ( | void * | tgt, |
| const void * | src, | ||
| size_t | bytes ) |
creates a memcpy task that copies untyped data in bytes
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| bytes | bytes to copy |
A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
| cudaTask tf::cudaGraphBase< Creator, Deleter >::memset | ( | void * | dst, |
| int | v, | ||
| size_t | count ) |
creates a memset task that fills untyped data with a byte value
| dst | pointer to the destination device memory area |
| v | value to set for each byte of specified memory |
| count | size in bytes to set |
A memset task fills the first count bytes of device memory area pointed by dst with the byte value v.
| cudaTask tf::cudaGraphBase< Creator, Deleter >::noop | ( | ) |
creates a no-operation task
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
| cudaTask tf::cudaGraphBase< Creator, Deleter >::single_task | ( | C | c | ) |
runs a callable with only a single kernel thread
| C | callable type |
| c | callable to run by a single kernel thread |
| cudaTask tf::cudaGraphBase< Creator, Deleter >::transform | ( | I | first, |
| I | last, | ||
| O | output, | ||
| C | op ) |
applies a callable to a source range and stores the result in a target range
| I | input iterator type |
| O | output iterator type |
| C | unary operator type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| first | iterator to the beginning of the input range |
| last | iterator to the end of the input range |
| output | iterator to the beginning of the output range |
| op | the operator to apply to transform each element in the range |
This method is equivalent to the parallel execution of the following loop on a GPU:
    while(first != last) {
      *output++ = op(*first++);
    }
| cudaTask tf::cudaGraphBase< Creator, Deleter >::transform | ( | I1 | first1, |
| I1 | last1, | ||
| I2 | first2, | ||
| O | output, | ||
| C | op ) |
creates a task to perform parallel transforms over two ranges of items
| I1 | first input iterator type |
| I2 | second input iterator type |
| O | output iterator type |
| C | binary operator type |
| E | execution policy (default tf::cudaDefaultExecutionPolicy) |
| first1 | iterator to the beginning of the first input range |
| last1 | iterator to the end of the first input range |
| first2 | iterator to the beginning of the second input range |
| output | iterator to the beginning of the output range |
| op | binary operator to apply to transform each pair of items in the two input ranges |
This method is equivalent to the parallel execution of the following loop on a GPU:
    while(first1 != last1) {
      *output++ = op(*first1++, *first2++);
    }
| cudaTask tf::cudaGraphBase< Creator, Deleter >::zero | ( | T * | dst, |
| size_t | count ) |
creates a memset task that sets a typed memory block to zero
| T | element type (size of T must be either 1, 2, or 4) |
| dst | pointer to the destination device memory area |
| count | number of elements |
A zero task zeroes the first count elements of type T in a device memory area pointed by dst.