Loading...
Searching...
No Matches
cuda_graph_exec.hpp
1#pragma once
2
3#include "cuda_graph.hpp"
4
5
6namespace tf {
7
8// ----------------------------------------------------------------------------
9// cudaGraphExec
10// ----------------------------------------------------------------------------
11
20
21 public:
22
26 cudaGraphExec_t operator () () const {
27 return nullptr;
28 }
29
33 cudaGraphExec_t operator () (cudaGraphExec_t exec) const {
34 return exec;
35 }
36
40 cudaGraphExec_t operator () (cudaGraph_t graph) const {
41 cudaGraphExec_t exec;
42 TF_CHECK_CUDA(
43 cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0),
44 "failed to create an executable graph"
45 );
46 return exec;
47 }
48
52 template <typename C, typename D>
53 cudaGraphExec_t operator () (const cudaGraphBase<C, D>& graph) const {
54 return this->operator()(graph.get());
55 }
56};
57
66
67 public:
68
76 void operator () (cudaGraphExec_t executable) const {
77 cudaGraphExecDestroy(executable);
78 }
79};
80
92template <typename Creator, typename Deleter>
93class cudaGraphExecBase : public std::unique_ptr<std::remove_pointer_t<cudaGraphExec_t>, Deleter> {
94
95 static_assert(std::is_pointer_v<cudaGraphExec_t>, "cudaGraphExec_t is not a pointer type");
96
97 public:
98
102 using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraphExec_t>, Deleter>;
103
111 template <typename... ArgsT>
112 explicit cudaGraphExecBase(ArgsT&& ... args) : base_type(
113 Creator{}(std::forward<ArgsT>(args)...), Deleter()
114 ) {}
115
120
125
126 // ----------------------------------------------------------------------------------------------
127 // Update Methods
128 // ----------------------------------------------------------------------------------------------
129
135 template <typename C>
136 void host(cudaTask task, C&& callable, void* user_data);
137
145 template <typename F, typename... ArgsT>
146 void kernel(
147 cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args
148 );
149
159 void memset(cudaTask task, void* dst, int ch, size_t count);
160
170 void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes);
171
182 template <typename T, std::enable_if_t<
183 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
184 >
185 void zero(cudaTask task, T* dst, size_t count);
186
197 template <typename T, std::enable_if_t<
198 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
199 >
200 void fill(cudaTask task, T* dst, T value, size_t count);
201
211 template <typename T,
212 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
213 >
214 void copy(cudaTask task, T* tgt, const T* src, size_t num);
215
216 //---------------------------------------------------------------------------
217 // Algorithm Primitives
218 //---------------------------------------------------------------------------
219
226 template <typename C>
227 void single_task(cudaTask task, C c);
228
232 template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
233 void for_each(cudaTask task, I first, I last, C callable);
234
238 template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
239 void for_each_index(cudaTask task, I first, I last, I step, C callable);
240
244 template <typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
245 void transform(cudaTask task, I first, I last, O output, C c);
246
250 template <typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
251 void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c);
252
253
254 private:
255
256 cudaGraphExecBase(const cudaGraphExecBase&) = delete;
257
259};
260
261// ------------------------------------------------------------------------------------------------
262// update methods
263// ------------------------------------------------------------------------------------------------
264
265// Function: host
266template <typename Creator, typename Deleter>
267template <typename C>
268void cudaGraphExecBase<Creator, Deleter>::host(cudaTask task, C&& func, void* user_data) {
269 cudaHostNodeParams p {func, user_data};
270 TF_CHECK_CUDA(
271 cudaGraphExecHostNodeSetParams(this->get(), task._native_node, &p),
272 "failed to update kernel parameters on ", task
273 );
274}
275
276// Function: update kernel parameters
277template <typename Creator, typename Deleter>
278template <typename F, typename... ArgsT>
280 cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT... args
281) {
282 cudaKernelNodeParams p;
283
284 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
285 p.func = (void*)f;
286 p.gridDim = g;
287 p.blockDim = b;
288 p.sharedMemBytes = s;
289 p.kernelParams = arguments;
290 p.extra = nullptr;
291
292 TF_CHECK_CUDA(
293 cudaGraphExecKernelNodeSetParams(this->get(), task._native_node, &p),
294 "failed to update kernel parameters on ", task
295 );
296}
297
298// Function: update copy parameters
299template <typename Creator, typename Deleter>
300template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
301void cudaGraphExecBase<Creator, Deleter>::copy(cudaTask task, T* tgt, const T* src, size_t num) {
302 auto p = cuda_get_copy_parms(tgt, src, num);
303 TF_CHECK_CUDA(
304 cudaGraphExecMemcpyNodeSetParams(this->get(), task._native_node, &p),
305 "failed to update memcpy parameters on ", task
306 );
307}
308
309// Function: update memcpy parameters
310template <typename Creator, typename Deleter>
312 cudaTask task, void* tgt, const void* src, size_t bytes
313) {
314 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
315
316 TF_CHECK_CUDA(
317 cudaGraphExecMemcpyNodeSetParams(this->get(), task._native_node, &p),
318 "failed to update memcpy parameters on ", task
319 );
320}
321
322// Procedure: memset
323template <typename Creator, typename Deleter>
324void cudaGraphExecBase<Creator, Deleter>::memset(cudaTask task, void* dst, int ch, size_t count) {
325 auto p = cuda_get_memset_parms(dst, ch, count);
326 TF_CHECK_CUDA(
327 cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p),
328 "failed to update memset parameters on ", task
329 );
330}
331
332// Procedure: fill
333template <typename Creator, typename Deleter>
334template <typename T, std::enable_if_t<
335 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
336>
337void cudaGraphExecBase<Creator, Deleter>::fill(cudaTask task, T* dst, T value, size_t count) {
338 auto p = cuda_get_fill_parms(dst, value, count);
339 TF_CHECK_CUDA(
340 cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p),
341 "failed to update memset parameters on ", task
342 );
343}
344
345// Procedure: zero
346template <typename Creator, typename Deleter>
347template <typename T, std::enable_if_t<
348 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
349>
350void cudaGraphExecBase<Creator, Deleter>::zero(cudaTask task, T* dst, size_t count) {
351 auto p = cuda_get_zero_parms(dst, count);
352 TF_CHECK_CUDA(
353 cudaGraphExecMemsetNodeSetParams(this->get(), task._native_node, &p),
354 "failed to update memset parameters on ", task
355 );
356}
357
358//-------------------------------------------------------------------------------------------------
359// forward declaration
360//-------------------------------------------------------------------------------------------------
361
365template <typename SC, typename SD>
367 TF_CHECK_CUDA(
368 cudaGraphLaunch(exec, this->get()), "failed to launch a CUDA executable graph"
369 );
370 return *this;
371}
372
376template <typename SC, typename SD>
377template <typename EC, typename ED>
379 return run(exec.get());
380}
381
382
383
384} // end of namespace tf -------------------------------------------------------------------------
class to create a CUDA graph with uunique ownership
Definition cuda_graph.hpp:531
class to create an executable CUDA graph with unique ownership
Definition cuda_graph_exec.hpp:93
void zero(cudaTask task, T *dst, size_t count)
updates parameters of a memset task to a zero task
Definition cuda_graph_exec.hpp:350
cudaGraphExecBase(ArgsT &&... args)
constructs a cudaGraphExec object by passing the given arguments to the executable CUDA graph creator
Definition cuda_graph_exec.hpp:112
void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)
updates parameters of a transform kernel task created from the CUDA graph of *this
Definition transform.hpp:108
cudaGraphExecBase & operator=(cudaGraphExecBase &&)=default
assign the rhs to *this using move semantics
cudaGraphExecBase(cudaGraphExecBase &&)=default
constructs a cudaGraphExec from the given rhs using move semantics
void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)
updates parameters of a kernel task
Definition cuda_graph_exec.hpp:279
void for_each_index(cudaTask task, I first, I last, I step, C callable)
updates parameters of a for_each_index kernel task created from the CUDA graph of *this
Definition for_each.hpp:92
std::unique_ptr< std::remove_pointer_t< cudaGraphExec_t >, Deleter > base_type
base std::unique_ptr type
Definition cuda_graph_exec.hpp:102
void host(cudaTask task, C &&callable, void *user_data)
updates parameters of a host task
Definition cuda_graph_exec.hpp:268
void transform(cudaTask task, I first, I last, O output, C c)
updates parameters of a transform kernel task created from the CUDA graph of *this
Definition transform.hpp:94
void memset(cudaTask task, void *dst, int ch, size_t count)
updates parameters of a memset task
Definition cuda_graph_exec.hpp:324
void single_task(cudaTask task, C c)
updates a single-threaded kernel task
void memcpy(cudaTask task, void *tgt, const void *src, size_t bytes)
updates parameters of a memcpy task
Definition cuda_graph_exec.hpp:311
void copy(cudaTask task, T *tgt, const T *src, size_t num)
updates parameters of a memcpy task to a copy task
Definition cuda_graph_exec.hpp:301
void for_each(cudaTask task, I first, I last, C callable)
updates parameters of a for_each kernel task created from the CUDA graph of *this
Definition for_each.hpp:66
void fill(cudaTask task, T *dst, T value, size_t count)
updates parameters of a memset task to a fill task
Definition cuda_graph_exec.hpp:337
class to create functors for constructing executable CUDA graphs
Definition cuda_graph_exec.hpp:19
cudaGraphExec_t operator()() const
returns a null executable CUDA graph
Definition cuda_graph_exec.hpp:26
class to create a functor for deleting an executable CUDA graph
Definition cuda_graph_exec.hpp:65
void operator()(cudaGraphExec_t executable) const
deletes an executable CUDA graph
Definition cuda_graph_exec.hpp:76
class to create a CUDA stream with unique ownership
Definition cuda_stream.hpp:189
cudaStreamBase(ArgsT &&... args)
constructs a cudaStream object by passing the given arguments to the stream creator
Definition cuda_stream.hpp:211
cudaStreamBase & run(const cudaGraphExecBase< C, D > &exec)
runs the given executable CUDA graph
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
taskflow namespace
Definition small_vector.hpp:20
cudaMemsetParams cuda_get_zero_parms(T *dst, size_t count)
gets the memset node parameter of a zero task (typed)
Definition cuda_graph.hpp:114
cudaMemcpy3DParms cuda_get_memcpy_parms(void *tgt, const void *src, size_t bytes)
gets the memcpy node parameter of a memcpy task (untyped)
Definition cuda_graph.hpp:44
cudaMemsetParams cuda_get_memset_parms(void *dst, int ch, size_t count)
gets the memset node parameter of a memcpy task (untyped)
Definition cuda_graph.hpp:69
cudaMemsetParams cuda_get_fill_parms(T *dst, T value, size_t count)
gets the memset node parameter of a fill task (typed)
Definition cuda_graph.hpp:90
cudaMemcpy3DParms cuda_get_copy_parms(T *tgt, const T *src, size_t num)
gets the memcpy node parameter of a copy task
Definition cuda_graph.hpp:23