5#include "cuda_memory.hpp"
6#include "cuda_stream.hpp"
7#include "cuda_meta.hpp"
9#include "../utility/traits.hpp"
21 std::enable_if_t<!std::is_same_v<T, void>,
void>* =
nullptr
25 using U = std::decay_t<T>;
30 p.srcPos = ::make_cudaPos(0, 0, 0);
31 p.srcPtr = ::make_cudaPitchedPtr(
const_cast<T*
>(src), num*
sizeof(U), num, 1);
33 p.dstPos = ::make_cudaPos(0, 0, 0);
34 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*
sizeof(U), num, 1);
35 p.extent = ::make_cudaExtent(num*
sizeof(U), 1, 1);
36 p.kind = cudaMemcpyDefault;
45 void* tgt,
const void* src,
size_t bytes
55 p.srcPos = ::make_cudaPos(0, 0, 0);
56 p.srcPtr = ::make_cudaPitchedPtr(
const_cast<void*
>(src), bytes, bytes, 1);
58 p.dstPos = ::make_cudaPos(0, 0, 0);
59 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
60 p.extent = ::make_cudaExtent(bytes, 1, 1);
61 p.kind = cudaMemcpyDefault;
87template <
typename T, std::enable_if_t<
88 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
97 static_assert(
sizeof(T) <=
sizeof(p.value),
"internal error");
98 std::memcpy(&p.value, &value,
sizeof(T));
101 p.elementSize =
sizeof(T);
111template <
typename T, std::enable_if_t<
112 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
120 p.elementSize =
sizeof(T);
133 cudaGraphGetRootNodes(graph,
nullptr, &num_nodes),
134 "failed to get native graph root nodes"
145 cudaGraphGetNodes(graph,
nullptr, &num_nodes),
146 "failed to get native graph nodes"
157 TF_CUDA_PRE13(cudaGraphGetEdges(graph, from, to, &num_edges))
158 TF_CUDA_POST13(cudaGraphGetEdges(graph, from, to,
nullptr, &num_edges)),
159 "failed to get native graph edges"
171 size_t num_predecessors;
173 TF_CUDA_PRE13(cudaGraphNodeGetDependencies(node, dependencies, &num_predecessors))
174 TF_CUDA_POST13(cudaGraphNodeGetDependencies(node, dependencies,
nullptr, &num_predecessors)),
175 "Failed to get number of dependencies");
176 return num_predecessors;
186 size_t num_successors;
188 TF_CUDA_PRE13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, &num_successors))
189 TF_CUDA_POST13(cudaGraphNodeGetDependentNodes(node, dependent_nodes,
nullptr, &num_successors)),
190 "Failed to get CUDA dependent nodes");
191 return num_successors;
203 TF_CUDA_PRE13(cudaGraphAddDependencies(graph, from, to, numDependencies))
204 TF_CUDA_POST13(cudaGraphAddDependencies(graph, from, to,
nullptr, numDependencies)),
205 "Failed to add CUDA graph node dependencies"
223 std::vector<cudaGraphNode_t> nodes(num_nodes);
225 cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
226 "failed to get native graph nodes"
236 std::vector<cudaGraphNode_t> nodes(num_nodes);
238 cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
239 "failed to get native graph nodes"
247inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
250 std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
252 std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
253 for(
size_t i=0; i<num_edges; i++) {
254 edges[i] = std::make_pair(froms[i], tos[i]);
273 cudaGraphNodeType type;
275 cudaGraphNodeGetType(node, &type),
"failed to get native graph node type"
287constexpr const char*
to_string(cudaGraphNodeType type) {
289 case cudaGraphNodeTypeKernel:
return "Kernel";
290 case cudaGraphNodeTypeMemcpy:
return "Memcpy";
291 case cudaGraphNodeTypeMemset:
return "Memset";
292 case cudaGraphNodeTypeHost:
return "Host";
293 case cudaGraphNodeTypeGraph:
return "Graph";
294 case cudaGraphNodeTypeEmpty:
return "Empty";
295 case cudaGraphNodeTypeWaitEvent:
return "WaitEvent";
296 case cudaGraphNodeTypeEventRecord:
return "EventRecord";
297 case cudaGraphNodeTypeExtSemaphoreSignal:
return "ExtSemaphoreSignal";
298 case cudaGraphNodeTypeExtSemaphoreWait:
return "ExtSemaphoreWait";
299 case cudaGraphNodeTypeMemAlloc:
return "MemAlloc";
300 case cudaGraphNodeTypeMemFree:
return "MemFree";
301 case cudaGraphNodeTypeConditional:
return "Conditional";
302 default:
return "undefined";
317 template <
typename Creator,
typename Deleter>
318 friend class cudaGraphBase;
320 template <
typename Creator,
typename Deleter>
321 friend class cudaGraphExecBase;
323 friend class cudaFlow;
324 friend class cudaFlowCapturer;
325 friend class cudaFlowCapturerBase;
355 template <
typename... Ts>
367 template <
typename... Ts>
390 void dump(std::ostream& os)
const;
394 cudaTask(cudaGraph_t, cudaGraphNode_t);
396 cudaGraph_t _native_graph {
nullptr};
397 cudaGraphNode_t _native_node {
nullptr};
402 _native_graph {native_graph}, _native_node {native_node} {
406template <
typename... Ts>
410 _native_graph, &_native_node, &(tasks._native_node), 1
417template <
typename... Ts>
419 (tasks.precede(*
this), ...);
435 cudaGraphNodeType
type;
436 cudaGraphNodeGetType(_native_node, &
type);
480 TF_CHECK_CUDA(cudaGraphCreate(&g, 0),
"failed to create a CUDA native graph");
530template <
typename Creator,
typename Deleter>
531class cudaGraphBase :
public std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, cudaGraphDeleter> {
533 static_assert(std::is_pointer_v<cudaGraph_t>,
"cudaGraph_t is not a pointer type");
540 using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>;
549 template <
typename... ArgsT>
551 Creator{}(std::forward<ArgsT>(args)...), Deleter()
619 template <
typename C>
636 template <
typename F,
typename... ArgsT>
679 template <
typename T, std::enable_if_t<
680 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
699 template <
typename T, std::enable_if_t<
700 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
718 template <
typename T,
719 std::enable_if_t<!std::is_same_v<T, void>,
void>* =
nullptr
736 template <
typename C>
760 template <
typename I,
typename C,
typename E = cudaDefaultExecutionPolicy>
791 template <
typename I,
typename C,
typename E = cudaDefaultExecutionPolicy>
817 template <
typename I,
typename O,
typename C,
typename E = cudaDefaultExecutionPolicy>
846 template <
typename I1,
typename I2,
typename O,
typename C,
typename E = cudaDefaultExecutionPolicy>
856template <
typename Creator,
typename Deleter>
860 cudaGraphGetNodes(this->get(),
nullptr, &n),
861 "failed to get native graph nodes"
867template <
typename Creator,
typename Deleter>
873template <
typename Creator,
typename Deleter>
954template <
typename Creator,
typename Deleter>
958 auto temp_path = std::filesystem::temp_directory_path() /
"graph_";
959 std::random_device rd;
960 std::uniform_int_distribution<int> dist(100000, 999999);
961 temp_path += std::to_string(dist(rd)) +
".dot";
964 TF_CHECK_CUDA(cudaGraphDebugDotPrint(this->get(), temp_path.string().c_str(), 0),
"");
967 std::ifstream file(temp_path);
971 std::filesystem::remove(temp_path);
973 TF_THROW(
"failed to open ", temp_path,
" for dumping the CUDA graph");
978template <
typename Creator,
typename Deleter>
981 cudaGraphNode_t node;
984 cudaGraphAddEmptyNode(&node, this->get(),
nullptr, 0),
985 "failed to create a no-operation (empty) node"
992template <
typename Creator,
typename Deleter>
996 cudaGraphNode_t node;
997 cudaHostNodeParams p {callable, user_data};
1000 cudaGraphAddHostNode(&node, this->get(),
nullptr, 0, &p),
1001 "failed to create a host node"
1004 return cudaTask(this->get(), node);
1008template <
typename Creator,
typename Deleter>
1009template <
typename F,
typename... ArgsT>
1011 dim3 g, dim3 b,
size_t s, F f, ArgsT... args
1014 cudaGraphNode_t node;
1015 cudaKernelNodeParams p;
1017 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
1022 p.sharedMemBytes = s;
1023 p.kernelParams = arguments;
1027 cudaGraphAddKernelNode(&node, this->get(),
nullptr, 0, &p),
1028 "failed to create a kernel task"
1031 return cudaTask(this->get(), node);
1035template <
typename Creator,
typename Deleter>
1036template <
typename T, std::enable_if_t<
1037 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1041 cudaGraphNode_t node;
1045 cudaGraphAddMemsetNode(&node, this->get(),
nullptr, 0, &p),
1046 "failed to create a memset (zero) task"
1049 return cudaTask(this->get(), node);
1053template <
typename Creator,
typename Deleter>
1054template <
typename T, std::enable_if_t<
1055 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1059 cudaGraphNode_t node;
1062 cudaGraphAddMemsetNode(&node, this->get(),
nullptr, 0, &p),
1063 "failed to create a memset (fill) task"
1066 return cudaTask(this->get(), node);
1070template <
typename Creator,
typename Deleter>
1073 std::enable_if_t<!std::is_same_v<T, void>,
void>*
1077 cudaGraphNode_t node;
1081 cudaGraphAddMemcpyNode(&node, this->get(),
nullptr, 0, &p),
1082 "failed to create a memcpy (copy) task"
1085 return cudaTask(this->get(), node);
1089template <
typename Creator,
typename Deleter>
1092 cudaGraphNode_t node;
1096 cudaGraphAddMemsetNode(&node, this->get(),
nullptr, 0, &p),
1097 "failed to create a memset task"
1100 return cudaTask(this->get(), node);
1104template <
typename Creator,
typename Deleter>
1107 cudaGraphNode_t node;
1111 cudaGraphAddMemcpyNode(&node, this->get(),
nullptr, 0, &p),
1112 "failed to create a memcpy task"
1115 return cudaTask(this->get(), node);
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcopy task that copies typed data
Definition cuda_graph.hpp:1075
size_t num_edges() const
queries the number of edges in a native CUDA graph
Definition cuda_graph.hpp:874
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:53
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cuda_graph.hpp:1090
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cuda_graph.hpp:1057
cudaGraphBase(cudaGraphBase &&)=default
constructs a cudaGraph from the given rhs using move semantics
cudaTask host(C &&callable, void *user_data)
creates a host task that runs a callable on the host
Definition cuda_graph.hpp:994
bool empty() const
queries if the graph is empty
Definition cuda_graph.hpp:868
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cuda_graph.hpp:1105
cudaGraphBase(ArgsT &&... args)
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator
Definition cuda_graph.hpp:550
std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, Deleter > base_type
base std::unique_ptr type
Definition cuda_graph.hpp:540
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:65
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
cudaTask single_task(C c)
runs a callable with only a single kernel thread
void dump(std::ostream &os)
dumps the CUDA graph to a DOT format through the given output stream
Definition cuda_graph.hpp:955
cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op)
creates a task to perform parallel transforms over two ranges of items
Definition transform.hpp:79
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:79
size_t num_nodes() const
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:857
cudaGraphBase & operator=(cudaGraphBase &&)=default
assign the rhs to *this using move semantics
cudaTask noop()
creates a no-operation task
Definition cuda_graph.hpp:979
class to create functors that construct CUDA graphs
Definition cuda_graph.hpp:465
cudaGraph_t operator()() const
creates a new CUDA graph
Definition cuda_graph.hpp:478
class to create a functor that deletes a CUDA graph
Definition cuda_graph.hpp:502
void operator()(cudaGraph_t g) const
deletes a CUDA graph
Definition cuda_graph.hpp:513
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
cudaTask(const cudaTask &)=default
copy-constructs a cudaTask
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
friend std::ostream & operator<<(std::ostream &, const cudaTask &)
overload of ostream inserter operator for cudaTask
Definition cuda_graph.hpp:448
size_t num_predecessors() const
queries the number of predecessors
Definition cuda_graph.hpp:424
size_t num_successors() const
queries the number of successors
Definition cuda_graph.hpp:429
cudaTask()=default
constructs an empty cudaTask
auto type() const
queries the type of this task
Definition cuda_graph.hpp:434
cudaTask & operator=(const cudaTask &)=default
copy-assigns a cudaTask
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407
void dump(std::ostream &os) const
dumps the task through an output stream
Definition cuda_graph.hpp:441
taskflow namespace
Definition small_vector.hpp:20
const char * to_string(TaskType type)
convert a task type to a human-readable string
Definition task.hpp:66
cudaMemsetParams cuda_get_zero_parms(T *dst, size_t count)
gets the memset node parameter of a zero task (typed)
Definition cuda_graph.hpp:114
std::vector< cudaGraphNode_t > cuda_graph_get_root_nodes(cudaGraph_t graph)
acquires the root nodes in a native CUDA graph
Definition cuda_graph.hpp:234
size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t *dependencies)
queries the dependencies of a native CUDA graph node and returns their number (handles the API difference between CUDA <= 12.x and CUDA 13)
Definition cuda_graph.hpp:170
size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes)
queries the dependent nodes of a native CUDA graph node and returns their number (handles the API difference between CUDA <= 12.x and CUDA 13)
Definition cuda_graph.hpp:185
std::vector< cudaGraphNode_t > cuda_graph_get_nodes(cudaGraph_t graph)
acquires the nodes in a native CUDA graph
Definition cuda_graph.hpp:221
cudaMemcpy3DParms cuda_get_memcpy_parms(void *tgt, const void *src, size_t bytes)
gets the memcpy node parameter of a memcpy task (untyped)
Definition cuda_graph.hpp:44
size_t cuda_graph_get_num_nodes(cudaGraph_t graph)
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:142
size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph)
queries the number of root nodes in a native CUDA graph
Definition cuda_graph.hpp:130
cudaMemsetParams cuda_get_memset_parms(void *dst, int ch, size_t count)
gets the memset node parameter of a memset task (untyped)
Definition cuda_graph.hpp:69
cudaMemsetParams cuda_get_fill_parms(T *dst, T value, size_t count)
gets the memset node parameter of a fill task (typed)
Definition cuda_graph.hpp:90
std::ostream & operator<<(std::ostream &os, const Task &task)
overload of ostream inserter operator for Task
Definition task.hpp:1221
size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to)
queries the edges of a native CUDA graph and returns their number (handles the API difference between CUDA <= 12.x and CUDA 13.x)
Definition cuda_graph.hpp:154
cudaMemcpy3DParms cuda_get_copy_parms(T *tgt, const T *src, size_t num)
gets the memcpy node parameter of a copy task
Definition cuda_graph.hpp:23
void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies)
adds dependency edges between nodes of a native CUDA graph (handles the API difference between CUDA <= 12.x and CUDA 13)
Definition cuda_graph.hpp:201
std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > cuda_graph_get_edges(cudaGraph_t graph)
acquires the edges in a native CUDA graph
Definition cuda_graph.hpp:248
cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node)
queries the type of a native CUDA graph node
Definition cuda_graph.hpp:272