3#include "../cudaflow.hpp"
// Device kernel for the single-input transform. The lambda handed to
// cuda_strided_iterate writes op(*(first + offset)) to *(output + offset),
// where offset = j + tile.begin.
//
// Template parameters:
//   I - type of `first`; advanced with `+` and dereferenced for reading
//   O - type of `output`; advanced with `+` and dereferenced for writing
//   C - type of the callable `op`, invoked with one argument
//   E - type providing the compile-time constants E::nv, E::nt and E::vt
//       (presumably an execution-policy type - confirm at its definition)
//
// Parameters:
//   first  - base of the input sequence, read at first + offset
//   count  - element count forwarded to cuda_get_tile
//   output - base of the output sequence, written at output + offset
//   op     - callable applied to each input element
//
// NOTE(review): this listing is truncated - the remaining arguments of the
// cuda_strided_iterate call and the closing braces are not visible here.
21template <
typename I,
typename O,
typename C,
typename E>
22__global__
void cuda_transform_kernel(I first,
unsigned count, O output, C op) {
// x-component of the thread index within the block.
// NOTE(review): tid is not used in the visible lines; presumably it is passed
// to cuda_strided_iterate in the missing tail - confirm.
23 auto tid = threadIdx.x;
// x-component of the block index within the grid.
24 auto bid = blockIdx.x;
// Tile computed from the block index, E::nv and count; only tile.begin is
// read in the visible code.
25 auto tile = cuda_get_tile(bid, E::nv, count);
26 cuda_strided_iterate<E::nt, E::vt>(
// Lambda captures by value; its first parameter is unnamed and unused, the
// second (j) is an index relative to the start of the tile.
27 [=]__device__(
auto,
auto j) {
// Translate the tile-relative index into an index into first/output.
28 auto offset = j + tile.begin;
29 *(output + offset) = op(*(first+offset));
// Device kernel for the two-input transform. The lambda handed to
// cuda_strided_iterate writes op(*(first1 + offset), *(first2 + offset)) to
// *(output + offset), where offset = j + tile.begin.
//
// Template parameters:
//   I1 - type of `first1`; advanced with `+` and dereferenced for reading
//   I2 - type of `first2`; advanced with `+` and dereferenced for reading
//   O  - type of `output`; advanced with `+` and dereferenced for writing
//   C  - type of the callable `op`, invoked with two arguments
//   E  - type providing the compile-time constants E::nv, E::nt and E::vt
//        (presumably an execution-policy type - confirm at its definition)
//
// Parameters:
//   first1 - base of the first input sequence, read at first1 + offset
//   first2 - base of the second input sequence, read at first2 + offset
//   count  - element count forwarded to cuda_get_tile
//   output - base of the output sequence, written at output + offset
//   op     - callable applied to each pair of input elements
//
// NOTE(review): this listing is truncated - the `) {` that closes the
// parameter list, the remaining arguments of the cuda_strided_iterate call and
// the closing braces are not visible here.
39template <
typename I1,
typename I2,
typename O,
typename C,
typename E>
40__global__
void cuda_transform_kernel(
41 I1 first1, I2 first2,
unsigned count, O output, C op
// x-component of the thread index within the block.
// NOTE(review): tid is not used in the visible lines; presumably it is passed
// to cuda_strided_iterate in the missing tail - confirm.
43 auto tid = threadIdx.x;
// x-component of the block index within the grid.
44 auto bid = blockIdx.x;
// Tile computed from the block index, E::nv and count; only tile.begin is
// read in the visible code.
45 auto tile = cuda_get_tile(bid, E::nv, count);
46 cuda_strided_iterate<E::nt, E::vt>(
// Lambda captures by value; its first parameter is unnamed and unused, the
// second (j) is an index relative to the start of the tile.
47 [=]__device__(
auto,
auto j) {
// The same offset is applied to both inputs and to the output.
48 auto offset = j + tile.begin;
49 *(output + offset) = op(*(first1+offset), *(first2+offset));
// Single-range transform member of a class template parameterized on
// <Creator, Deleter>; the member itself is templated on <I, O, C, E>.
// The tooltip index at the end of this listing attributes transform.hpp:65 to
//   cudaTask transform(I first, I last, O output, C op)
// described there as "applies a callable to a source range and stores the
// result in a target range".
// NOTE(review): the signature line, the call that receives the arguments
// below, and the closing lines are missing from this listing; only the visible
// fragments are documented.
63template <
typename Creator,
typename Deleter>
64template <
typename I,
typename O,
typename C,
typename E>
// Number of elements between first and last.
// NOTE(review): std::distance returns a signed difference type; assigning it
// to `unsigned` narrows, so a distance outside the unsigned range would not be
// preserved - confirm callers stay within range.
67 unsigned count = std::distance(first, last);
// Argument list of a call whose head is not visible. Presumably these are the
// grid size, block size and shared-memory bytes of a kernel(...) call,
// followed by the kernel instantiation and its arguments - confirm.
70 E::num_blocks(count), E::nt, 0,
71 detail::cuda_transform_kernel<I, O, C, E>,
72 first, count, output, c
// Two-range transform member of a class template parameterized on
// <Creator, Deleter>; the member itself is templated on <I1, I2, O, C, E>.
// NOTE(review): the signature line, the call that receives the arguments
// below, and the closing lines are missing from this listing. By its position
// after the single-range overload this is presumably the task-creating
// two-range overload - confirm against the full header.
77template <
typename Creator,
typename Deleter>
78template <
typename I1,
typename I2,
typename O,
typename C,
typename E>
// Number of elements between first1 and last1; only the first range
// determines the count.
// NOTE(review): std::distance returns a signed difference type; assigning it
// to `unsigned` narrows, so a distance outside the unsigned range would not be
// preserved - confirm callers stay within range.
81 unsigned count = std::distance(first1, last1);
// Argument list of a call whose head is not visible. Presumably these are the
// grid size, block size and shared-memory bytes of a kernel(...) call,
// followed by the two-input kernel instantiation and its arguments - confirm.
84 E::num_blocks(count), E::nt, 0,
85 detail::cuda_transform_kernel<I1, I2, O, C, E>,
86 first1, first2, count, output, c
// Single-range transform member that updates an existing task; it belongs to a
// class template parameterized on <Creator, Deleter> and is itself templated
// on <I, O, C, E>.
// The tooltip index at the end of this listing attributes transform.hpp:94 to
//   void transform(cudaTask task, I first, I last, O output, C c)
// described there as "updates parameters of a transform kernel task created
// from the CUDA graph of *this".
// NOTE(review): the signature line, the call that receives the arguments
// below, and the closing lines are missing from this listing; only the visible
// fragments are documented.
92template <
typename Creator,
typename Deleter>
93template <
typename I,
typename O,
typename C,
typename E>
// Number of elements between first and last.
// NOTE(review): std::distance returns a signed difference type; assigning it
// to `unsigned` narrows, so a distance outside the unsigned range would not be
// preserved - confirm callers stay within range.
96 unsigned count = std::distance(first, last);
// Argument list of a call whose head is not visible. Presumably these follow
// the task handle in a kernel(task, grid, block, shm, f, args...) update
// call - confirm.
99 E::num_blocks(count), E::nt, 0,
100 detail::cuda_transform_kernel<I, O, C, E>,
101 first, count, output, c
// Two-range transform member that takes an existing task handle; it belongs to
// a class template parameterized on <Creator, Deleter> and is itself templated
// on <I1, I2, O, C, E>.
//
// Parameters (from the visible parameter line):
//   task          - handle of the task the call operates on
//   first1, last1 - first input range; its length determines `count`
//   first2        - start of the second input sequence
//   output        - start of the output sequence
//   c             - callable forwarded to the two-input kernel
//
// NOTE(review): the function name/return type line, the call that receives
// the arguments below, and the closing lines are missing from this listing.
// Presumably this is the two-range counterpart of the single-range task
// update - confirm against the full header.
106template <
typename Creator,
typename Deleter>
107template <
typename I1,
typename I2,
typename O,
typename C,
typename E>
// NOTE(review): first2 has no matching end iterator; presumably the caller
// guarantees the second sequence holds at least `count` elements - confirm.
109 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
// Number of elements between first1 and last1.
// NOTE(review): std::distance returns a signed difference type; assigning it
// to `unsigned` narrows, so a distance outside the unsigned range would not be
// preserved - confirm callers stay within range.
111 unsigned count = std::distance(first1, last1);
// Argument list of a call whose head is not visible. Presumably these follow
// the task handle in a kernel(task, grid, block, shm, f, args...) update
// call - confirm.
114 E::num_blocks(count), E::nt, 0,
115 detail::cuda_transform_kernel<I1, I2, O, C, E>,
116 first1, first2, count, output, c
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:65
void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)
updates parameters of a kernel task
Definition cuda_graph_exec.hpp:279
void transform(cudaTask task, I first, I last, O output, C c)
updates parameters of a transform kernel task created from the CUDA graph of *this
Definition transform.hpp:94
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
taskflow namespace
Definition small_vector.hpp:20