Loading...
Searching...
No Matches
transform.hpp
1#pragma once
2
3#include "../cudaflow.hpp"
4
9
10namespace tf {
11
// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
15
16namespace detail {
17
// Kernel: unary transform
// Applies the callable `op` to elements read through `first` and writes each
// result through `output` at the same offset.
//
// Template parameters:
//   I - type of the source iterator
//   O - type of the output iterator
//   C - type of the unary callable
//   E - execution policy type; this kernel reads E::nv, E::nt and E::vt
//
// Parameters:
//   first  - iterator to the first source element
//   count  - number of elements, forwarded to cuda_get_tile
//   output - iterator to the first output element
//   op     - callable invoked with one dereferenced source element
template <typename I, typename O, typename C, typename E>
__global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) {
  auto tid = threadIdx.x;
  auto bid = blockIdx.x;
  // tile.begin is used below as the offset of this block's elements and
  // tile.count() as the number of indices handed to cuda_strided_iterate
  auto tile = cuda_get_tile(bid, E::nv, count);
  cuda_strided_iterate<E::nt, E::vt>(
    [=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to index the full ranges
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first+offset));
    },
    tid,
    tile.count()
  );
}
35
// Kernel: binary transform
// Applies the callable `op` to pairs of elements read through `first1` and
// `first2` at the same offset and writes each result through `output` at that
// offset.
//
// Template parameters:
//   I1 - type of the first source iterator
//   I2 - type of the second source iterator
//   O  - type of the output iterator
//   C  - type of the binary callable
//   E  - execution policy type; this kernel reads E::nv, E::nt and E::vt
//
// Parameters:
//   first1 - iterator to the first element of the first source range
//   first2 - iterator to the first element of the second source range
//   count  - number of elements, forwarded to cuda_get_tile
//   output - iterator to the first output element
//   op     - callable invoked with one element from each source range
template <typename I1, typename I2, typename O, typename C, typename E>
__global__ void cuda_transform_kernel(
  I1 first1, I2 first2, unsigned count, O output, C op
) {
  auto tid = threadIdx.x;
  auto bid = blockIdx.x;
  // tile.begin is used below as the offset of this block's elements and
  // tile.count() as the number of indices handed to cuda_strided_iterate
  auto tile = cuda_get_tile(bid, E::nv, count);
  cuda_strided_iterate<E::nt, E::vt>(
    [=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to index the full ranges
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first1+offset), *(first2+offset));
    },
    tid,
    tile.count()
  );
}
55
56} // end of namespace detail -------------------------------------------------
57
// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------
61
// Function: transform
// Creates a kernel task that applies the unary callable `c` to the elements
// of [first, last) and stores the results starting at `output`.
//
// Parameters:
//   first, last - source range
//   output      - iterator to the first output element
//   c           - unary callable forwarded to the transform kernel
//
// Returns the cudaTask produced by kernel().
template <typename Creator, typename Deleter>
template <typename I, typename O, typename C, typename E>
cudaTask cudaGraphBase<Creator, Deleter>::transform(I first, I last, O output, C c) {

  // the transform kernel takes an unsigned element count, so the iterator
  // distance is narrowed explicitly
  unsigned count = static_cast<unsigned>(std::distance(first, last));

  // launch arguments: grid size, block size, shared-memory bytes (none),
  // kernel function, then the kernel's own arguments
  return kernel(
    E::num_blocks(count), E::nt, 0,
    detail::cuda_transform_kernel<I, O, C, E>,
    first, count, output, c
  );
}
75
// Function: transform
// Creates a kernel task that applies the binary callable `c` to matching
// elements of [first1, last1) and the range starting at `first2`, and stores
// the results starting at `output`.
//
// Parameters:
//   first1, last1 - first source range; its length sets the element count
//   first2        - iterator to the first element of the second source range
//   output        - iterator to the first output element
//   c             - binary callable forwarded to the transform kernel
//
// Returns the cudaTask produced by kernel().
template <typename Creator, typename Deleter>
template <typename I1, typename I2, typename O, typename C, typename E>
cudaTask cudaGraphBase<Creator, Deleter>::transform(I1 first1, I1 last1, I2 first2, O output, C c) {

  // the transform kernel takes an unsigned element count, so the iterator
  // distance is narrowed explicitly
  unsigned count = static_cast<unsigned>(std::distance(first1, last1));

  // launch arguments: grid size, block size, shared-memory bytes (none),
  // kernel function, then the kernel's own arguments
  return kernel(
    E::num_blocks(count), E::nt, 0,
    detail::cuda_transform_kernel<I1, I2, O, C, E>,
    first1, first2, count, output, c
  );
}
89
90
// Function: update transform
// Updates the parameters of an existing unary transform kernel task so that
// it applies `c` to [first, last) and stores the results starting at `output`.
//
// Parameters:
//   task        - the kernel task to update
//   first, last - new source range
//   output      - iterator to the first output element
//   c           - unary callable forwarded to the transform kernel
template <typename Creator, typename Deleter>
template <typename I, typename O, typename C, typename E>
void cudaGraphExecBase<Creator, Deleter>::transform(cudaTask task, I first, I last, O output, C c) {

  // the transform kernel takes an unsigned element count, so the iterator
  // distance is narrowed explicitly
  unsigned count = static_cast<unsigned>(std::distance(first, last));

  // same launch arguments as task creation, applied to `task`
  kernel(task,
    E::num_blocks(count), E::nt, 0,
    detail::cuda_transform_kernel<I, O, C, E>,
    first, count, output, c
  );
}
104
// Function: update transform
// Updates the parameters of an existing binary transform kernel task so that
// it applies `c` to matching elements of [first1, last1) and the range
// starting at `first2`, and stores the results starting at `output`.
//
// Parameters:
//   task          - the kernel task to update
//   first1, last1 - first source range; its length sets the element count
//   first2        - iterator to the first element of the second source range
//   output        - iterator to the first output element
//   c             - binary callable forwarded to the transform kernel
template <typename Creator, typename Deleter>
template <typename I1, typename I2, typename O, typename C, typename E>
void cudaGraphExecBase<Creator, Deleter>::transform(
  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
) {
  // the transform kernel takes an unsigned element count, so the iterator
  // distance is narrowed explicitly
  unsigned count = static_cast<unsigned>(std::distance(first1, last1));

  // same launch arguments as task creation, applied to `task`
  kernel(task,
    E::num_blocks(count), E::nt, 0,
    detail::cuda_transform_kernel<I1, I2, O, C, E>,
    first1, first2, count, output, c
  );
}
119
120} // end of namespace tf -----------------------------------------------------
121
122
123
124
125
126
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:65
void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT... args)
updates parameters of a kernel task
Definition cuda_graph_exec.hpp:279
void transform(cudaTask task, I first, I last, O output, C c)
updates parameters of a transform kernel task created from the CUDA graph of *this
Definition transform.hpp:94
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
taskflow namespace
Definition small_vector.hpp:20