Loading...
Searching...
No Matches
cuda_graph.hpp
1#pragma once
2
3#include <filesystem>
4
5#include "cuda_memory.hpp"
6#include "cuda_stream.hpp"
7#include "cuda_meta.hpp"
8
9#include "../utility/traits.hpp"
10
11namespace tf {
12
13// ----------------------------------------------------------------------------
14// cudaGraph_t routines
15// ----------------------------------------------------------------------------
16
20template <typename T,
21 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
22>
23cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
24
25 using U = std::decay_t<T>;
26
27 cudaMemcpy3DParms p;
28
29 p.srcArray = nullptr;
30 p.srcPos = ::make_cudaPos(0, 0, 0);
31 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
32 p.dstArray = nullptr;
33 p.dstPos = ::make_cudaPos(0, 0, 0);
34 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
35 p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
36 p.kind = cudaMemcpyDefault;
37
38 return p;
39}
40
44inline cudaMemcpy3DParms cuda_get_memcpy_parms(
45 void* tgt, const void* src, size_t bytes
46) {
47
48 // Parameters in cudaPitchedPtr
49 // d - Pointer to allocated memory
50 // p - Pitch of allocated memory in bytes
51 // xsz - Logical width of allocation in elements
52 // ysz - Logical height of allocation in elements
53 cudaMemcpy3DParms p;
54 p.srcArray = nullptr;
55 p.srcPos = ::make_cudaPos(0, 0, 0);
56 p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
57 p.dstArray = nullptr;
58 p.dstPos = ::make_cudaPos(0, 0, 0);
59 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
60 p.extent = ::make_cudaExtent(bytes, 1, 1);
61 p.kind = cudaMemcpyDefault;
62
63 return p;
64}
65
69inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
70
71 cudaMemsetParams p;
72 p.dst = dst;
73 p.value = ch;
74 p.pitch = 0;
75 //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
76 //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
77 p.elementSize = 1; // either 1, 2, or 4
78 p.width = count;
79 p.height = 1;
80
81 return p;
82}
83
87template <typename T, std::enable_if_t<
88 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
89>
90cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
91
92 cudaMemsetParams p;
93 p.dst = dst;
94
95 // perform bit-wise copy
96 p.value = 0; // crucial
97 static_assert(sizeof(T) <= sizeof(p.value), "internal error");
98 std::memcpy(&p.value, &value, sizeof(T));
99
100 p.pitch = 0;
101 p.elementSize = sizeof(T); // either 1, 2, or 4
102 p.width = count;
103 p.height = 1;
104
105 return p;
106}
107
111template <typename T, std::enable_if_t<
112 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
113>
114cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
115
116 cudaMemsetParams p;
117 p.dst = dst;
118 p.value = 0;
119 p.pitch = 0;
120 p.elementSize = sizeof(T); // either 1, 2, or 4
121 p.width = count;
122 p.height = 1;
123
124 return p;
125}
126
130inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) {
131 size_t num_nodes;
132 TF_CHECK_CUDA(
133 cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
134 "failed to get native graph root nodes"
135 );
136 return num_nodes;
137}
138
142inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) {
143 size_t num_nodes;
144 TF_CHECK_CUDA(
145 cudaGraphGetNodes(graph, nullptr, &num_nodes),
146 "failed to get native graph nodes"
147 );
148 return num_nodes;
149}
150
154inline size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to) {
155 size_t num_edges;
156 TF_CHECK_CUDA(
157 TF_CUDA_PRE13(cudaGraphGetEdges(graph, from, to, &num_edges))
158 TF_CUDA_POST13(cudaGraphGetEdges(graph, from, to, nullptr, &num_edges)),
159 "failed to get native graph edges"
160 );
161 return num_edges;
162}
163
170inline size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t* dependencies) {
171 size_t num_predecessors;
172 TF_CHECK_CUDA(
173 TF_CUDA_PRE13(cudaGraphNodeGetDependencies(node, dependencies, &num_predecessors))
174 TF_CUDA_POST13(cudaGraphNodeGetDependencies(node, dependencies, nullptr, &num_predecessors)),
175 "Failed to get number of dependencies");
176 return num_predecessors;
177}
178
185inline size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes) {
186 size_t num_successors;
187 TF_CHECK_CUDA(
188 TF_CUDA_PRE13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, &num_successors))
189 TF_CUDA_POST13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, nullptr, &num_successors)),
190 "Failed to get CUDA dependent nodes");
191 return num_successors;
192}
193
201inline void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) {
202 TF_CHECK_CUDA(
203 TF_CUDA_PRE13(cudaGraphAddDependencies(graph, from, to, numDependencies))
204 TF_CUDA_POST13(cudaGraphAddDependencies(graph, from, to, nullptr, numDependencies)),
205 "Failed to add CUDA graph node dependencies"
206 );
207}
208
212inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) {
213 return cuda_graph_get_num_edges(graph, nullptr, nullptr);
214}
215
216
217
221inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) {
222 size_t num_nodes = cuda_graph_get_num_nodes(graph);
223 std::vector<cudaGraphNode_t> nodes(num_nodes);
224 TF_CHECK_CUDA(
225 cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
226 "failed to get native graph nodes"
227 );
228 return nodes;
229}
230
234inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) {
235 size_t num_nodes = cuda_graph_get_num_root_nodes(graph);
236 std::vector<cudaGraphNode_t> nodes(num_nodes);
237 TF_CHECK_CUDA(
238 cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
239 "failed to get native graph nodes"
240 );
241 return nodes;
242}
243
247inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
248cuda_graph_get_edges(cudaGraph_t graph) {
249 size_t num_edges = cuda_graph_get_num_edges(graph);
250 std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
251 num_edges = cuda_graph_get_num_edges(graph, froms.data(), tos.data());
252 std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
253 for(size_t i=0; i<num_edges; i++) {
254 edges[i] = std::make_pair(froms[i], tos[i]);
255 }
256 return edges;
257}
258
272inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
273 cudaGraphNodeType type;
274 TF_CHECK_CUDA(
275 cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
276 );
277 return type;
278}
279
280// ----------------------------------------------------------------------------
281// cudaTask Types
282// ----------------------------------------------------------------------------
283
287constexpr const char* to_string(cudaGraphNodeType type) {
288 switch (type) {
289 case cudaGraphNodeTypeKernel: return "Kernel";
290 case cudaGraphNodeTypeMemcpy: return "Memcpy";
291 case cudaGraphNodeTypeMemset: return "Memset";
292 case cudaGraphNodeTypeHost: return "Host";
293 case cudaGraphNodeTypeGraph: return "Graph";
294 case cudaGraphNodeTypeEmpty: return "Empty";
295 case cudaGraphNodeTypeWaitEvent: return "WaitEvent";
296 case cudaGraphNodeTypeEventRecord: return "EventRecord";
297 case cudaGraphNodeTypeExtSemaphoreSignal: return "ExtSemaphoreSignal";
298 case cudaGraphNodeTypeExtSemaphoreWait: return "ExtSemaphoreWait";
299 case cudaGraphNodeTypeMemAlloc: return "MemAlloc";
300 case cudaGraphNodeTypeMemFree: return "MemFree";
301 case cudaGraphNodeTypeConditional: return "Conditional";
302 default: return "undefined";
303 }
304}
305
306// ----------------------------------------------------------------------------
307// cudaTask
308// ----------------------------------------------------------------------------
309
315class cudaTask {
316
317 template <typename Creator, typename Deleter>
318 friend class cudaGraphBase;
319
320 template <typename Creator, typename Deleter>
321 friend class cudaGraphExecBase;
322
323 friend class cudaFlow;
324 friend class cudaFlowCapturer;
325 friend class cudaFlowCapturerBase;
326
327 friend std::ostream& operator << (std::ostream&, const cudaTask&);
328
329 public:
330
334 cudaTask() = default;
335
339 cudaTask(const cudaTask&) = default;
340
344 cudaTask& operator = (const cudaTask&) = default;
345
355 template <typename... Ts>
356 cudaTask& precede(Ts&&... tasks);
357
367 template <typename... Ts>
368 cudaTask& succeed(Ts&&... tasks);
369
373 size_t num_successors() const;
374
378 size_t num_predecessors() const;
379
383 auto type() const;
384
390 void dump(std::ostream& os) const;
391
392 private:
393
394 cudaTask(cudaGraph_t, cudaGraphNode_t);
395
396 cudaGraph_t _native_graph {nullptr};
397 cudaGraphNode_t _native_node {nullptr};
398};
399
400// Constructor
401inline cudaTask::cudaTask(cudaGraph_t native_graph, cudaGraphNode_t native_node) :
402 _native_graph {native_graph}, _native_node {native_node} {
403}
404
405// Function: precede
406template <typename... Ts>
408 (
410 _native_graph, &_native_node, &(tasks._native_node), 1
411 ), ...
412 );
413 return *this;
414}
415
416// Function: succeed
417template <typename... Ts>
419 (tasks.precede(*this), ...);
420 return *this;
421}
422
423// Function: num_predecessors
424inline size_t cudaTask::num_predecessors() const {
425 return cuda_graph_node_get_dependencies(_native_node, nullptr);
426}
427
428// Function: num_successors
429inline size_t cudaTask::num_successors() const {
430 return cuda_graph_node_get_dependent_nodes(_native_node, nullptr);
431}
432
433// Function: type
434inline auto cudaTask::type() const {
435 cudaGraphNodeType type;
436 cudaGraphNodeGetType(_native_node, &type);
437 return type;
438}
439
440// Function: dump
441inline void cudaTask::dump(std::ostream& os) const {
442 os << "cudaTask [type=" << to_string(type()) << ']';
443}
444
448inline std::ostream& operator << (std::ostream& os, const cudaTask& ct) {
449 ct.dump(os);
450 return os;
451}
452
453// ----------------------------------------------------------------------------
454// cudaGraph
455// ----------------------------------------------------------------------------
456
466
467 public:
468
478 cudaGraph_t operator () () const {
479 cudaGraph_t g;
480 TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");
481 return g;
482 }
483
487 cudaGraph_t operator () (cudaGraph_t graph) const {
488 return graph;
489 }
490
491};
492
503
504 public:
505
513 void operator () (cudaGraph_t g) const {
514 cudaGraphDestroy(g);
515 }
516};
517
518
530template <typename Creator, typename Deleter>
531class cudaGraphBase : public std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, cudaGraphDeleter> {
532
533 static_assert(std::is_pointer_v<cudaGraph_t>, "cudaGraph_t is not a pointer type");
534
535 public:
536
540 using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>;
541
549 template <typename... ArgsT>
550 explicit cudaGraphBase(ArgsT&& ... args) : base_type(
551 Creator{}(std::forward<ArgsT>(args)...), Deleter()
552 ) {
553 }
554
559
564
568 size_t num_nodes() const;
569
573 size_t num_edges() const;
574
578 bool empty() const;
579
585 void dump(std::ostream& os);
586
587 // ------------------------------------------------------------------------
588 // Graph building routines
589 // ------------------------------------------------------------------------
590
604
619 template <typename C>
620 cudaTask host(C&& callable, void* user_data);
621
636 template <typename F, typename... ArgsT>
637 cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args);
638
651 cudaTask memset(void* dst, int v, size_t count);
652
665 cudaTask memcpy(void* tgt, const void* src, size_t bytes);
666
679 template <typename T, std::enable_if_t<
680 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
681 >
682 cudaTask zero(T* dst, size_t count);
683
699 template <typename T, std::enable_if_t<
700 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
701 >
702 cudaTask fill(T* dst, T value, size_t count);
703
718 template <typename T,
719 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
720 >
721 cudaTask copy(T* tgt, const T* src, size_t num);
722
723 // ------------------------------------------------------------------------
724 // generic algorithms
725 // ------------------------------------------------------------------------
726
736 template <typename C>
738
760 template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
761 cudaTask for_each(I first, I last, C callable);
762
791 template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
792 cudaTask for_each_index(I first, I last, I step, C callable);
793
817 template <typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
818 cudaTask transform(I first, I last, O output, C op);
819
846 template <typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
847 cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
848
849 private:
850
851 cudaGraphBase(const cudaGraphBase&) = delete;
852 cudaGraphBase& operator = (const cudaGraphBase&) = delete;
853};
854
855// query the number of nodes
856template <typename Creator, typename Deleter>
858 size_t n;
859 TF_CHECK_CUDA(
860 cudaGraphGetNodes(this->get(), nullptr, &n),
861 "failed to get native graph nodes"
862 );
863 return n;
864}
865
866// query the emptiness
867template <typename Creator, typename Deleter>
869 return num_nodes() == 0;
870}
871
872// query the number of edges
873template <typename Creator, typename Deleter>
875 return cuda_graph_get_num_edges(this->get());
876}
877
879//inline void cudaGraph::dump(std::ostream& os) {
880//
881// // acquire the native handle
882// auto g = this->get();
883//
884// os << "digraph cudaGraph {\n";
885//
886// std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
887// stack.push(std::make_tuple(g, nullptr, 1));
888//
889// int pl = 0;
890//
891// while(stack.empty() == false) {
892//
893// auto [graph, parent, l] = stack.top();
894// stack.pop();
895//
896// for(int i=0; i<pl-l+1; i++) {
897// os << "}\n";
898// }
899//
900// os << "subgraph cluster_p" << graph << " {\n"
901// << "label=\"cudaGraph-L" << l << "\";\n"
902// << "color=\"purple\";\n";
903//
904// auto nodes = cuda_graph_get_nodes(graph);
905// auto edges = cuda_graph_get_edges(graph);
906//
907// for(auto& [from, to] : edges) {
908// os << 'p' << from << " -> " << 'p' << to << ";\n";
909// }
910//
911// for(auto& node : nodes) {
912// auto type = cuda_get_graph_node_type(node);
913// if(type == cudaGraphNodeTypeGraph) {
914//
915// cudaGraph_t child_graph;
916// TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "");
917// stack.push(std::make_tuple(child_graph, node, l+1));
918//
919// os << 'p' << node << "["
920// << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
921// << "label=\"cudaGraph-L" << l+1
922// << "\"];\n";
923// }
924// else {
925// os << 'p' << node << "[label=\""
926// << to_string(type)
927// << "\"];\n";
928// }
929// }
930//
931// // precede to parent
932// if(parent != nullptr) {
933// std::unordered_set<cudaGraphNode_t> successors;
934// for(const auto& p : edges) {
935// successors.insert(p.first);
936// }
937// for(auto node : nodes) {
938// if(successors.find(node) == successors.end()) {
939// os << 'p' << node << " -> " << 'p' << parent << ";\n";
940// }
941// }
942// }
943//
944// // set the previous level
945// pl = l;
946// }
947//
948// for(int i=0; i<=pl; i++) {
949// os << "}\n";
950// }
951//}
952
953// dump the graph
954template <typename Creator, typename Deleter>
956
957 // Generate a unique temporary filename in the system's temp directory using filesystem
958 auto temp_path = std::filesystem::temp_directory_path() / "graph_";
959 std::random_device rd;
960 std::uniform_int_distribution<int> dist(100000, 999999); // Generates a random number
961 temp_path += std::to_string(dist(rd)) + ".dot";
962
963 // Call the original function with the temporary file
964 TF_CHECK_CUDA(cudaGraphDebugDotPrint(this->get(), temp_path.string().c_str(), 0), "");
965
966 // Read the file and write to the output stream
967 std::ifstream file(temp_path);
968 if (file) {
969 os << file.rdbuf(); // Copy file contents to the stream
970 file.close();
971 std::filesystem::remove(temp_path); // Clean up the temporary file
972 } else {
973 TF_THROW("failed to open ", temp_path, " for dumping the CUDA graph");
974 }
975}
976
977// Function: noop
978template <typename Creator, typename Deleter>
980
981 cudaGraphNode_t node;
982
983 TF_CHECK_CUDA(
984 cudaGraphAddEmptyNode(&node, this->get(), nullptr, 0),
985 "failed to create a no-operation (empty) node"
986 );
987
988 return cudaTask(this->get(), node);
989}
990
991// Function: host
992template <typename Creator, typename Deleter>
993template <typename C>
994cudaTask cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data) {
995
996 cudaGraphNode_t node;
997 cudaHostNodeParams p {callable, user_data};
998
999 TF_CHECK_CUDA(
1000 cudaGraphAddHostNode(&node, this->get(), nullptr, 0, &p),
1001 "failed to create a host node"
1002 );
1003
1004 return cudaTask(this->get(), node);
1005}
1006
1007// Function: kernel
1008template <typename Creator, typename Deleter>
1009template <typename F, typename... ArgsT>
1011 dim3 g, dim3 b, size_t s, F f, ArgsT... args
1012) {
1013
1014 cudaGraphNode_t node;
1015 cudaKernelNodeParams p;
1016
1017 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
1018
1019 p.func = (void*)f;
1020 p.gridDim = g;
1021 p.blockDim = b;
1022 p.sharedMemBytes = s;
1023 p.kernelParams = arguments;
1024 p.extra = nullptr;
1025
1026 TF_CHECK_CUDA(
1027 cudaGraphAddKernelNode(&node, this->get(), nullptr, 0, &p),
1028 "failed to create a kernel task"
1029 );
1030
1031 return cudaTask(this->get(), node);
1032}
1033
1034// Function: zero
1035template <typename Creator, typename Deleter>
1036template <typename T, std::enable_if_t<
1037 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1038>
1040
1041 cudaGraphNode_t node;
1042 auto p = cuda_get_zero_parms(dst, count);
1043
1044 TF_CHECK_CUDA(
1045 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1046 "failed to create a memset (zero) task"
1047 );
1048
1049 return cudaTask(this->get(), node);
1050}
1051
1052// Function: fill
1053template <typename Creator, typename Deleter>
1054template <typename T, std::enable_if_t<
1055 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1056>
1057cudaTask cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count) {
1058
1059 cudaGraphNode_t node;
1060 auto p = cuda_get_fill_parms(dst, value, count);
1061 TF_CHECK_CUDA(
1062 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1063 "failed to create a memset (fill) task"
1064 );
1065
1066 return cudaTask(this->get(), node);
1067}
1068
1069// Function: copy
1070template <typename Creator, typename Deleter>
1071template <
1072 typename T,
1073 std::enable_if_t<!std::is_same_v<T, void>, void>*
1074>
1075cudaTask cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num) {
1076
1077 cudaGraphNode_t node;
1078 auto p = cuda_get_copy_parms(tgt, src, num);
1079
1080 TF_CHECK_CUDA(
1081 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),
1082 "failed to create a memcpy (copy) task"
1083 );
1084
1085 return cudaTask(this->get(), node);
1086}
1087
1088// Function: memset
1089template <typename Creator, typename Deleter>
1090cudaTask cudaGraphBase<Creator, Deleter>::memset(void* dst, int ch, size_t count) {
1091
1092 cudaGraphNode_t node;
1093 auto p = cuda_get_memset_parms(dst, ch, count);
1094
1095 TF_CHECK_CUDA(
1096 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1097 "failed to create a memset task"
1098 );
1099
1100 return cudaTask(this->get(), node);
1101}
1102
1103// Function: memcpy
1104template <typename Creator, typename Deleter>
1105cudaTask cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes) {
1106
1107 cudaGraphNode_t node;
1108 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1109
1110 TF_CHECK_CUDA(
1111 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),
1112 "failed to create a memcpy task"
1113 );
1114
1115 return cudaTask(this->get(), node);
1116}
1117
1118
1119
1120
1121
1122} // end of namespace tf -----------------------------------------------------
1123
1124
1125
1126
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcpy task that copies typed data
Definition cuda_graph.hpp:1075
size_t num_edges() const
queries the number of edges in a native CUDA graph
Definition cuda_graph.hpp:874
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:53
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cuda_graph.hpp:1090
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cuda_graph.hpp:1057
cudaGraphBase(cudaGraphBase &&)=default
constructs a cudaGraph from the given rhs using move semantics
cudaTask host(C &&callable, void *user_data)
creates a host task that runs a callable on the host
Definition cuda_graph.hpp:994
bool empty() const
queries if the graph is empty
Definition cuda_graph.hpp:868
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cuda_graph.hpp:1105
cudaGraphBase(ArgsT &&... args)
constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
Definition cuda_graph.hpp:550
std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, Deleter > base_type
base std::unique_ptr type
Definition cuda_graph.hpp:540
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:65
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
cudaTask single_task(C c)
runs a callable with only a single kernel thread
void dump(std::ostream &os)
dumps the CUDA graph to a DOT format through the given output stream
Definition cuda_graph.hpp:955
cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op)
creates a task to perform parallel transforms over two ranges of items
Definition transform.hpp:79
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:79
size_t num_nodes() const
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:857
cudaGraphBase & operator=(cudaGraphBase &&)=default
assign the rhs to *this using move semantics
cudaTask noop()
creates a no-operation task
Definition cuda_graph.hpp:979
class to create functors that construct CUDA graphs
Definition cuda_graph.hpp:465
cudaGraph_t operator()() const
creates a new CUDA graph
Definition cuda_graph.hpp:478
class to create a functor that deletes a CUDA graph
Definition cuda_graph.hpp:502
void operator()(cudaGraph_t g) const
deletes a CUDA graph
Definition cuda_graph.hpp:513
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
cudaTask(const cudaTask &)=default
copy-constructs a cudaTask
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
friend std::ostream & operator<<(std::ostream &, const cudaTask &)
overload of ostream inserter operator for cudaTask
Definition cuda_graph.hpp:448
size_t num_predecessors() const
queries the number of predecessors
Definition cuda_graph.hpp:424
size_t num_successors() const
queries the number of successors
Definition cuda_graph.hpp:429
cudaTask()=default
constructs an empty cudaTask
auto type() const
queries the type of this task
Definition cuda_graph.hpp:434
cudaTask & operator=(const cudaTask &)=default
copy-assigns a cudaTask
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407
void dump(std::ostream &os) const
dumps the task through an output stream
Definition cuda_graph.hpp:441
taskflow namespace
Definition small_vector.hpp:20
const char * to_string(TaskType type)
convert a task type to a human-readable string
Definition task.hpp:66
cudaMemsetParams cuda_get_zero_parms(T *dst, size_t count)
gets the memset node parameter of a zero task (typed)
Definition cuda_graph.hpp:114
std::vector< cudaGraphNode_t > cuda_graph_get_root_nodes(cudaGraph_t graph)
acquires the root nodes in a native CUDA graph
Definition cuda_graph.hpp:234
size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t *dependencies)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:170
size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:185
std::vector< cudaGraphNode_t > cuda_graph_get_nodes(cudaGraph_t graph)
acquires the nodes in a native CUDA graph
Definition cuda_graph.hpp:221
cudaMemcpy3DParms cuda_get_memcpy_parms(void *tgt, const void *src, size_t bytes)
gets the memcpy node parameter of a memcpy task (untyped)
Definition cuda_graph.hpp:44
size_t cuda_graph_get_num_nodes(cudaGraph_t graph)
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:142
size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph)
queries the number of root nodes in a native CUDA graph
Definition cuda_graph.hpp:130
cudaMemsetParams cuda_get_memset_parms(void *dst, int ch, size_t count)
gets the memset node parameter of a memset task (untyped)
Definition cuda_graph.hpp:69
cudaMemsetParams cuda_get_fill_parms(T *dst, T value, size_t count)
gets the memset node parameter of a fill task (typed)
Definition cuda_graph.hpp:90
std::ostream & operator<<(std::ostream &os, const Task &task)
overload of ostream inserter operator for Task
Definition task.hpp:1221
size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to)
Handles compatibility with CUDA <= 12.x and CUDA == 13.x.
Definition cuda_graph.hpp:154
cudaMemcpy3DParms cuda_get_copy_parms(T *tgt, const T *src, size_t num)
gets the memcpy node parameter of a copy task
Definition cuda_graph.hpp:23
void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:201
std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > cuda_graph_get_edges(cudaGraph_t graph)
acquires the edges in a native CUDA graph
Definition cuda_graph.hpp:248
cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node)
queries the type of a native CUDA graph node
Definition cuda_graph.hpp:272