tf namespace
taskflow namespace
Classes
-
template<typename T, unsigned N = 2>class SmallVector
- class to define a vector optimized for small array
- class Graph
- class to create a graph object
- class Runtime
- class to include a runtime object in a task
- struct TaskParams
- task parameters to use when creating an asynchronous task
- struct DefaultTaskParams
- empty task parameter type for compile-time optimization
-
template<typename T>class UnboundedTaskQueue
- class to create a lock-free unbounded single-producer multiple-consumer queue
-
template<typename T, size_t LogSize = TF_class BoundedTaskQueue
DEFAULT_ BOUNDED_ TASK_ QUEUE_ LOG_ SIZE> - class to create a lock-free bounded single-producer multiple-consumer queue
- class FlowBuilder
- class to build a task dependency graph
- class Subflow
- class to construct a subflow graph from the execution of a dynamic task
- class Worker
- class to create a worker in an executor
- class WorkerView
- class to create an immutable view of a worker in an executor
- class Executor
- class to create an executor
- class Task
- class to create a task handle over a node in a taskflow graph
- class TaskView
- class to access task information from the observer interface
- class AsyncTask
- class to create a dependent asynchronous task (async task)
- class Semaphore
- class to create a semophore object for building a concurrency constraint
- class Taskflow
- class to create a taskflow object
-
template<typename T>class Future
- class to access the result of an execution
- class ObserverInterface
- class to derive an executor observer
- class ChromeObserver
- class to create an observer based on Chrome tracing format
- class TFProfObserver
- class to create an observer based on the built-in taskflow profiler format
- struct DefaultClosureWrapper
- default closure wrapper that simply runs the given closure as is
-
template<typename C = DefaultClosureWrapper>class PartitionerBase
- class to derive a partitioner for scheduling parallel algorithms
-
template<typename C = DefaultClosureWrapper>class GuidedPartitioner
- class to construct a guided partitioner for scheduling parallel algorithms
-
template<typename C = DefaultClosureWrapper>class DynamicPartitioner
- class to construct a dynamic partitioner for scheduling parallel algorithms
-
template<typename C = DefaultClosureWrapper>class StaticPartitioner
- class to construct a static partitioner for scheduling parallel algorithms
-
template<typename C = DefaultClosureWrapper>class RandomPartitioner
- class to construct a random partitioner for scheduling parallel algorithms
- class Pipeflow
- class to create a pipeflow object used by the pipe callable
-
template<typename C = std::class Pipe
function<void(tf:: Pipeflow&)>> - class to create a pipe object for a pipeline stage
-
template<typename... Ps>class Pipeline
- class to create a pipeline scheduling framework
-
template<typename P>class ScalablePipeline
- class to create a scalable pipeline object
-
template<typename Input, typename Output, typename C>class DataPipe
- class to create a stage in a data-parallel pipeline
-
template<typename... Ps>class DataPipeline
- class to create a data-parallel pipeline scheduling framework
- class cudaScopedDevice
- class to create an RAII-styled context switch
-
template<typename T>class cudaDeviceAllocator
- class to create a CUDA device allocator
-
template<typename T>class cudaUSMAllocator
- class to create a unified shared memory (USM) allocator
- class cudaStream
- class to create an RAII-styled wrapper over a native CUDA stream
- class cudaEvent
- class to create an RAII-styled wrapper over a native CUDA event
- class cudaTask
- class to create a task handle over an internal node of a cudaFlow graph
- class cudaFlow
- class to create a cudaFlow task dependency graph
- class cudaFlowSequentialOptimizer
- class to capture a CUDA graph using a sequential stream
- class cudaFlowLinearOptimizer
- class to capture a linear CUDA graph using a sequential stream
- class cudaFlowRoundRobinOptimizer
- class to capture a CUDA graph using a round-robin algorithm
- class cudaFlowCapturer
- class to create a cudaFlow graph using stream capture
-
template<unsigned NT, unsigned VT>class cudaExecutionPolicy
- class to define execution policy for CUDA standard algorithms
Enums
- enum class TaskType: int { PLACEHOLDER = 0, STATIC, SUBFLOW, CONDITION, MODULE, ASYNC, UNDEFINED }
- enumeration of all task types
- enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
- enumeration of all observer types
- enum class PartitionerType: int { STATIC, DYNAMIC }
- enumeration of all partitioner types
- enum class PipeType: int { PARALLEL = 1, SERIAL = 2 }
- enumeration of all pipe types
- enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
- enumeration of all cudaTask types
Typedefs
-
using observer_stamp_t = std::
chrono:: time_point<std:: chrono:: steady_clock> - default time point type of observers
- using DefaultPartitioner = GuidedPartitioner<>
- default partitioner set to tf::
GuidedPartitioner - using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
- default execution policy
Functions
- auto to_string(TaskType type) -> const char*
- convert a task type to a human-readable string
-
auto operator<<(std::
ostream& os, const Task& task) -> std:: ostream& - overload of ostream inserter operator for Task
-
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>auto try_acquire(I first, I last) -> bool
- tries to acquire all semaphores in the specified range
-
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>auto try_acquire(S && ... semaphores) -> bool
- tries to acquire all semaphores
-
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>void release(I first, I last)
- tries to acquire all semaphores in the specified range
-
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>void release(S && ... semaphores)
- tries to acquire all semaphores
- auto to_string(ObserverType type) -> const char*
- convert an observer type to a human-readable string
-
template<typename Input, typename Output, typename C>auto make_data_pipe(PipeType d, C&& callable) -> auto
- function to construct a data pipe (tf::
DataPipe) - auto cuda_get_num_devices() -> size_t
- queries the number of available devices
- auto cuda_get_device() -> int
- gets the current device associated with the caller thread
- void cuda_set_device(int id)
- switches to a given device context
- void cuda_get_device_property(int i, cudaDeviceProp& p)
- obtains the device property
- auto cuda_get_device_property(int i) -> cudaDeviceProp
- obtains the device property
-
void cuda_dump_device_property(std::
ostream& os, const cudaDeviceProp& p) - dumps the device property
- auto cuda_get_device_max_threads_per_block(int d) -> size_t
- queries the maximum threads per block on a device
- auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
- queries the maximum x-dimension per block on a device
- auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
- queries the maximum y-dimension per block on a device
- auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
- queries the maximum z-dimension per block on a device
- auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
- queries the maximum x-dimension per grid on a device
- auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
- queries the maximum y-dimension per grid on a device
- auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
- queries the maximum z-dimension per grid on a device
- auto cuda_get_device_max_shm_per_block(int d) -> size_t
- queries the maximum shared memory size in bytes per block on a device
- auto cuda_get_device_warp_size(int d) -> size_t
- queries the warp size on a device
- auto cuda_get_device_compute_capability_major(int d) -> int
- queries the major number of compute capability of a device
- auto cuda_get_device_compute_capability_minor(int d) -> int
- queries the minor number of compute capability of a device
- auto cuda_get_device_unified_addressing(int d) -> bool
- queries if the device supports unified addressing
- auto cuda_get_driver_version() -> int
- queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
- auto cuda_get_runtime_version() -> int
- queries the CUDA Runtime version (1000 * major + 10 * minor)
- auto cuda_get_free_mem(int d) -> size_t
- queries the free memory (expensive call)
- auto cuda_get_total_mem(int d) -> size_t
- queries the total available memory (expensive call)
-
template<typename T>auto cuda_malloc_device(size_t N, int d) -> T*
- allocates memory on the given device for holding
N
elements of typeT
-
template<typename T>auto cuda_malloc_device(size_t N) -> T*
- allocates memory on the current device associated with the caller
-
template<typename T>auto cuda_malloc_shared(size_t N) -> T*
- allocates shared memory for holding
N
elements of typeT
-
template<typename T>void cuda_free(T* ptr, int d)
- frees memory on the GPU device
-
template<typename T>void cuda_free(T* ptr)
- frees memory on the GPU device
- void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
- copies data between host and device asynchronously through a stream
- void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
- initializes or sets GPU memory to the given value byte by byte
- auto to_string(cudaTaskType type) -> const char* constexpr
- convert a cuda_task type to a human-readable string
-
auto operator<<(std::
ostream& os, const cudaTask& ct) -> std:: ostream& - overload of ostream inserter operator for cudaTask
-
template<typename P, typename C>void cuda_single_task(P&& p, C c)
- runs a callable asynchronously using one kernel thread
-
template<typename P, typename I, typename C>void cuda_for_each(P&& p, I first, I last, C c)
- performs asynchronous parallel iterations over a range of items
-
template<typename P, typename I, typename C>void cuda_for_each_index(P&& p, I first, I last, I inc, C c)
- performs asynchronous parallel iterations over an index-based range of items
-
template<typename P, typename I, typename O, typename C>void cuda_transform(P&& p, I first, I last, O output, C op)
- performs asynchronous parallel transforms over a range of items
-
template<typename P, typename I1, typename I2, typename O, typename C>void cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)
- performs asynchronous parallel transforms over two ranges of items
-
template<typename P, typename I, typename T, typename O>void cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)
- performs asynchronous parallel reduction over a range of items
-
template<typename P, typename I, typename T, typename O>void cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)
- performs asynchronous parallel reduction over a range of items without an initial value
-
template<typename P, typename I, typename T, typename O, typename U>void cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
- performs asynchronous parallel reduction over a range of transformed items without an initial value
-
template<typename P, typename I, typename T, typename O, typename U>void cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
- performs asynchronous parallel reduction over a range of transformed items with an initial value
-
template<typename P, typename I, typename O, typename C>void cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
- performs asynchronous inclusive scan over a range of items
-
template<typename P, typename I, typename O, typename C, typename U>void cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
- performs asynchronous inclusive scan over a range of transformed items
-
template<typename P, typename I, typename O, typename C>void cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
- performs asynchronous exclusive scan over a range of items
-
template<typename P, typename I, typename O, typename C, typename U>void cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
- performs asynchronous exclusive scan over a range of items
-
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>void cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)
- performs asynchronous key-value merge over a range of keys and values
-
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>void cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)
- performs asynchronous key-only merge over a range of keys
-
template<typename P, typename K, typename V = cudaEmpty>auto cuda_sort_buffer_size(unsigned count) -> unsigned
- queries the buffer size in bytes needed to call sort kernels for the given number of elements
-
template<typename P, typename K_it, typename V_it, typename C>void cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)
- performs asynchronous key-value sort on a range of items
-
template<typename P, typename K_it, typename C>void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)
- performs asynchronous key-only sort on a range of items
-
template<typename P, typename I, typename U>void cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)
- finds the index of the first element that satisfies the given criteria
-
template<typename P, typename I, typename O>void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
- finds the index of the minimum element in a range
-
template<typename P, typename I, typename O>void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
- finds the index of the maximum element in a range
- auto version() -> const char* constexpr
- queries the version information in a string format
major.minor.patch
Variables
-
template<typename P>bool is_task_params_v constexpr
- determines if the given type is a task parameter type
-
std::
array<TaskType, 6> TASK_TYPES constexpr - array of all task types (used for iterating task types)
-
template<typename C>bool is_subflow_task_v constexpr
- determines if a callable is a dynamic task
-
template<typename C>bool is_condition_task_v constexpr
- determines if a callable is a condition task
-
template<typename C>bool is_multi_condition_task_v constexpr
- determines if a callable is a multi-condition task
-
template<typename C>bool is_static_task_v constexpr
- determines if a callable is a static task
-
template<typename P>bool is_partitioner_v constexpr
- determines if a type is a partitioner
Enum documentation
enum class tf:: TaskType: int
#include <taskflow/core/task.hpp>
enumeration of all task types
Enumerators | |
---|---|
PLACEHOLDER |
placeholder task type |
STATIC |
static task type |
SUBFLOW |
dynamic (subflow) task type |
CONDITION |
condition task type |
MODULE |
module task type |
ASYNC |
asynchronous task type |
UNDEFINED |
undefined task type (for internal use only) |
enum class tf:: ObserverType: int
#include <taskflow/core/observer.hpp>
enumeration of all observer types
enum class tf:: PartitionerType: int
#include <taskflow/algorithm/partitioner.hpp>
enumeration of all partitioner types
Enumerators | |
---|---|
STATIC |
static partitioner type |
DYNAMIC |
dynamic partitioner type |
enum class tf:: PipeType: int
#include <taskflow/algorithm/pipeline.hpp>
enumeration of all pipe types
Enumerators | |
---|---|
PARALLEL |
parallel type |
SERIAL |
serial type |
enum class tf:: cudaTaskType: int
#include <taskflow/cuda/cuda_task.hpp>
enumeration of all cudaTask types
Enumerators | |
---|---|
EMPTY |
empty task type |
HOST |
host task type |
MEMSET |
memory set task type |
MEMCPY |
memory copy task type |
KERNEL |
memory copy task type |
SUBFLOW |
subflow (child graph) task type |
CAPTURE |
capture task type |
UNDEFINED |
undefined task type |
Typedef documentation
using tf:: observer_stamp_t = std:: chrono:: time_point<std:: chrono:: steady_clock>
#include <taskflow/core/observer.hpp>
default time point type of observers
using tf:: DefaultPartitioner = GuidedPartitioner<>
#include <taskflow/algorithm/partitioner.hpp>
default partitioner set to tf::
Guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.
using tf:: cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
default execution policy
Function documentation
const char* tf:: to_string(TaskType type)
#include <taskflow/core/task.hpp>
convert a task type to a human-readable string
The name of each task type is the litte-case string of its characters.
TaskType::PLACEHOLDER -> "placeholder" TaskType::STATIC -> "static" TaskType::SUBFLOW -> "subflow" TaskType::CONDITION -> "condition" TaskType::MODULE -> "module" TaskType::ASYNC -> "async"
std:: ostream& tf:: operator<<(std:: ostream& os,
const Task& task)
#include <taskflow/core/task.hpp>
overload of ostream inserter operator for Task
#include <taskflow/core/semaphore.hpp>
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
bool tf:: try_acquire(I first,
I last)
tries to acquire all semaphores in the specified range
Template parameters | |
---|---|
I | iterator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
Returns | true if all semaphores are acquired, otherwise false |
Tries to acquire all semaphores in the specified range.
#include <taskflow/core/semaphore.hpp>
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
bool tf:: try_acquire(S && ... semaphores)
tries to acquire all semaphores
Parameters | |
---|---|
semaphores | semaphores to acquire |
Returns | true if all semaphores are acquired, otherwise false |
Tries to acquire all the semaphores.
#include <taskflow/core/semaphore.hpp>
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void tf:: release(I first,
I last)
tries to acquire all semaphores in the specified range
Template parameters | |
---|---|
I | iterator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
Releases all the semaphores in the given range.
#include <taskflow/core/semaphore.hpp>
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void tf:: release(S && ... semaphores)
tries to acquire all semaphores
Parameters | |
---|---|
semaphores | semaphores to release |
Releases all the semaphores.
const char* tf:: to_string(ObserverType type)
#include <taskflow/core/observer.hpp>
convert an observer type to a human-readable string
template<typename Input, typename Output, typename C>
auto tf:: make_data_pipe(PipeType d,
C&& callable)
function to construct a data pipe (tf::
Template parameters | |
---|---|
Input | input data type |
Output | output data type |
C | callable type |
tf::
tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int& input) { return std::to_string(input + 100); } );
The callable can additionally take a reference of tf::
tf::make_data_pipe<int, std::string>( tf::PipeType::SERIAL, [](int& input, tf::Pipeflow& pf) { printf("token=%lu, line=%lu\n", pf.token(), pf.line()); return std::to_string(input + 100); } );
size_t tf:: cuda_get_num_devices()
#include <taskflow/cuda/cuda_device.hpp>
queries the number of available devices
int tf:: cuda_get_device()
#include <taskflow/cuda/cuda_device.hpp>
gets the current device associated with the caller thread
void tf:: cuda_set_device(int id)
#include <taskflow/cuda/cuda_device.hpp>
switches to a given device context
void tf:: cuda_get_device_property(int i,
cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>
obtains the device property
cudaDeviceProp tf:: cuda_get_device_property(int i)
#include <taskflow/cuda/cuda_device.hpp>
obtains the device property
void tf:: cuda_dump_device_property(std:: ostream& os,
const cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>
dumps the device property
size_t tf:: cuda_get_device_max_threads_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum threads per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per block on a device
size_t tf:: cuda_get_device_max_y_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per block on a device
size_t tf:: cuda_get_device_max_z_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per grid on a device
size_t tf:: cuda_get_device_max_y_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per grid on a device
size_t tf:: cuda_get_device_max_z_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per grid on a device
size_t tf:: cuda_get_device_max_shm_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum shared memory size in bytes per block on a device
size_t tf:: cuda_get_device_warp_size(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the warp size on a device
int tf:: cuda_get_device_compute_capability_major(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the major number of compute capability of a device
int tf:: cuda_get_device_compute_capability_minor(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the minor number of compute capability of a device
bool tf:: cuda_get_device_unified_addressing(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries if the device supports unified addressing
int tf:: cuda_get_driver_version()
#include <taskflow/cuda/cuda_device.hpp>
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
int tf:: cuda_get_runtime_version()
#include <taskflow/cuda/cuda_device.hpp>
queries the CUDA Runtime version (1000 * major + 10 * minor)
size_t tf:: cuda_get_free_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>
queries the free memory (expensive call)
size_t tf:: cuda_get_total_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>
queries the total available memory (expensive call)
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N,
int d)
allocates memory on the given device for holding N
elements of type T
The function calls cudaMalloc
to allocate N*sizeof(T)
bytes of memory on the given device d
and returns a pointer to the starting address of the device memory.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N)
allocates memory on the current device associated with the caller
The function calls malloc_device from the current device associated with the caller.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_shared(size_t N)
allocates shared memory for holding N
elements of type T
The function calls cudaMallocManaged
to allocate N*sizeof(T)
bytes of memory and returns a pointer to the starting address of the shared memory.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr,
int d)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
d | device context identifier |
This methods call cudaFree
to free the memory space pointed to by ptr
using the given device context.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
This methods call cudaFree
to free the memory space pointed to by ptr
using the current device context of the caller.
void tf:: cuda_memcpy_async(cudaStream_t stream,
void* dst,
const void* src,
size_t count)
#include <taskflow/cuda/cuda_memory.hpp>
copies data between host and device asynchronously through a stream
Parameters | |
---|---|
stream | stream identifier |
dst | destination memory address |
src | source memory address |
count | size in bytes to copy |
The method calls cudaMemcpyAsync
with the given stream
using cudaMemcpyDefault
to infer the memory space of the source and the destination pointers. The memory areas may not overlap.
void tf:: cuda_memset_async(cudaStream_t stream,
void* devPtr,
int value,
size_t count)
#include <taskflow/cuda/cuda_memory.hpp>
initializes or sets GPU memory to the given value byte by byte
Parameters | |
---|---|
stream | stream identifier |
devPtr | pointer to GPU memory |
value | value to set for each byte of the specified memory |
count | size in bytes to set |
The method calls cudaMemsetAsync
with the given stream
to fill the first count
bytes of the memory area pointed to by devPtr
with the constant byte value value
.
const char* tf:: to_string(cudaTaskType type) constexpr
#include <taskflow/cuda/cuda_task.hpp>
convert a cuda_task type to a human-readable string
std:: ostream& tf:: operator<<(std:: ostream& os,
const cudaTask& ct)
#include <taskflow/cuda/cuda_task.hpp>
overload of ostream inserter operator for cudaTask
template<typename P, typename C>
void tf:: cuda_single_task(P&& p,
C c)
runs a callable asynchronously using one kernel thread
Template parameters | |
---|---|
P | execution policy type |
C | closure type |
Parameters | |
p | execution policy |
c | closure to run by one kernel thread |
The function launches a single kernel thread to run the given callable through the stream in the execution policy object.
template<typename P, typename I, typename C>
void tf:: cuda_for_each(P&& p,
I first,
I last,
C c)
performs asynchronous parallel iterations over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
C | unary operator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
c | unary operator to apply to each dereferenced iterator |
This function is equivalent to a parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { c(*itr); }
template<typename P, typename I, typename C>
void tf:: cuda_for_each_index(P&& p,
I first,
I last,
I inc,
C c)
performs asynchronous parallel iterations over an index-based range of items
Template parameters | |
---|---|
P | execution policy type |
I | input index type |
C | unary operator type |
Parameters | |
p | execution policy object |
first | index to the beginning of the range |
last | index to the end of the range |
inc | step size between successive iterations |
c | unary operator to apply to each index |
This function is equivalent to a parallel execution of the following loop on a GPU:
// step is positive [first, last) for(auto i=first; i<last; i+=step) { c(i); } // step is negative [first, last) for(auto i=first; i>last; i+=step) { c(i); }
template<typename P, typename I, typename O, typename C>
void tf:: cuda_transform(P&& p,
I first,
I last,
O output,
C op)
performs asynchronous parallel transforms over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
output | iterator to the beginning of the output range |
op | unary operator to apply to transform each item |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename P, typename I1, typename I2, typename O, typename C>
void tf:: cuda_transform(P&& p,
I1 first1,
I1 last1,
I2 first2,
O output,
C op)
performs asynchronous parallel transforms over two ranges of items
Template parameters | |
---|---|
P | execution policy type |
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
p | execution policy |
first1 | iterator to the beginning of the first range |
last1 | iterator to the end of the first range |
first2 | iterator to the beginning of the second range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O>
void tf:: cuda_reduce(P&& p,
I first,
I last,
T* res,
O op,
void* buf)
performs asynchronous parallel reduction over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
op | binary operator to apply to reduce elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = op(*result, *first++); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O>
void tf:: cuda_uninitialized_reduce(P&& p,
I first,
I last,
T* res,
O op,
void* buf)
performs asynchronous parallel reduction over a range of items without an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
op | binary operator to apply to reduce elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = *first++; // no initial values partitipcate in the loop while (first != last) { *result = op(*result, *first++); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O, typename U>
void tf:: cuda_transform_reduce(P&& p,
I first,
I last,
T* res,
O bop,
U uop,
void* buf)
performs asynchronous parallel reduction over a range of transformed items without an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
bop | binary operator to apply to reduce elements |
uop | unary operator to apply to transform elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = bop(*result, uop(*first++)); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O, typename U>
void tf:: cuda_uninitialized_transform_reduce(P&& p,
I first,
I last,
T* res,
O bop,
U uop,
void* buf)
performs asynchronous parallel reduction over a range of transformed items with an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
bop | binary operator to apply to reduce elements |
uop | unary operator to apply to transform elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = uop(*first++); // no initial values partitipcate in the loop while (first != last) { *result = bop(*result, uop(*first++)); }
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C>
void tf:: cuda_inclusive_scan(P&& p,
I first,
I last,
O output,
C op,
void* buf)
performs asynchronous inclusive scan over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C, typename U>
void tf:: cuda_transform_inclusive_scan(P&& p,
I first,
I last,
O output,
C bop,
U uop,
void* buf)
performs asynchronous inclusive scan over a range of transformed items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
bop | binary operator to apply to scan |
uop | unary operator to apply to transform each item before scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C>
void tf:: cuda_exclusive_scan(P&& p,
I first,
I last,
O output,
C op,
void* buf)
performs asynchronous exclusive scan over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C, typename U>
void tf:: cuda_transform_exclusive_scan(P&& p,
I first,
I last,
O output,
C bop,
U uop,
void* buf)
performs asynchronous exclusive scan over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
bop | binary operator to apply to scan |
uop | unary operator to apply to transform each item before scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/merge.hpp>
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf:: cuda_merge_by_key(P&& p,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp,
void* buf)
performs asynchronous key-value merge over a range of keys and values
Template parameters | |
---|---|
P | execution policy type |
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
Parameters | |
p | execution policy |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
buf | pointer to the temporary buffer |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last)
and [b_keys_first, b_keys_last)
into a single range, [c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first + (a_keys_last - a_keys_first))
and [b_vals_first + (b_keys_last - b_keys_first))
into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
a_keys
= {1, 8};a_vals
= {2, 1};b_keys
= {3, 7};b_vals
= {3, 4};
After the merge, we have:
c_keys
= {1, 3, 7, 8}c_vals
= {2, 3, 4, 1}
#include <taskflow/cuda/algorithm/merge.hpp>
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void tf:: cuda_merge(P&& p,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
c_keys_it c_keys_first,
C comp,
void* buf)
performs asynchronous key-only merge over a range of keys
Template parameters | |
---|---|
P | execution policy type |
a_keys_it | first key iterator type |
b_keys_it | second key iterator type |
c_keys_it | output key iterator type |
C | comparator type |
Parameters | |
p | execution policy |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
c_keys_first | iterator to the beginning of the output key range |
comp | comparator |
buf | pointer to the temporary buffer |
This function is equivalent to tf::
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K, typename V = cudaEmpty>
unsigned tf:: cuda_sort_buffer_size(unsigned count)
queries the buffer size in bytes needed to call sort kernels for the given number of elements
Template parameters | |
---|---|
P | execution policy type |
K | key type |
V | value type (default tf::cudaEmpty) |
Parameters | |
count | number of keys/values to sort |
The function is used to allocate a buffer for calling tf::
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K_it, typename V_it, typename C>
void tf:: cuda_sort_by_key(P&& p,
K_it k_first,
K_it k_last,
V_it v_first,
C comp,
void* buf)
performs asynchronous key-value sort on a range of items
Template parameters | |
---|---|
P | execution policy type |
K_it | key iterator type |
V_it | value iterator type |
C | comparator type |
Parameters | |
p | execution policy |
k_first | iterator to the beginning of the key range |
k_last | iterator to the end of the key range |
v_first | iterator to the beginning of the value range |
comp | binary comparator |
buf | pointer to the temporary buffer |
Sorts key-value elements in [k_first, k_last)
and [v_first, v_first + (k_last - k_first))
into ascending key order using the given comparator comp
. If i
and j
are any two valid iterators in [k_first, k_last)
such that i
precedes j
, and p
and q
are iterators in [v_first, v_first + (k_last - k_first))
corresponding to i
and j
respectively, then comp(*j, *i)
evaluates to false
.
For example, assume:
keys
are{1, 4, 2, 8, 5, 7}
values
are{'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
keys
are{1, 2, 4, 5, 7, 8}
values
are{'a', 'c', 'b', 'e', 'f', 'd'}
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K_it, typename C>
void tf:: cuda_sort(P&& p,
K_it k_first,
K_it k_last,
C comp,
void* buf)
performs asynchronous key-only sort on a range of items
Template parameters | |
---|---|
P | execution policy type |
K_it | key iterator type |
C | comparator type |
Parameters | |
p | execution policy |
k_first | iterator to the beginning of the key range |
k_last | iterator to the end of the key range |
comp | binary comparator |
buf | pointer to the temporary buffer |
This method is equivalent to tf::
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename U>
void tf:: cuda_find_if(P&& p,
I first,
I last,
unsigned* idx,
U op)
finds the index of the first element that satisfies the given criteria
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
The function launches kernels asynchronously to find the index idx
of the first element in the range [first, last)
such that op(*(first+idx))
is true. This is equivalent to the parallel execution of the following loop:
unsigned idx = 0; for(; first != last; ++first, ++idx) { if (p(*first)) { return idx; } } return idx;
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename O>
void tf:: cuda_min_element(P&& p,
I first,
I last,
unsigned* idx,
O op,
void* buf)
finds the index of the minimum element in a range
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | comparator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
buf | pointer to the buffer |
The function launches kernels asynchronously to find the smallest element in the range [first, last)
using the given comparator op
. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto smallest = first; for (++first; first != last; ++first) { if (op(*first, *smallest)) { smallest = first; } } return std::distance(first, smallest);
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename O>
void tf:: cuda_max_element(P&& p,
I first,
I last,
unsigned* idx,
O op,
void* buf)
finds the index of the maximum element in a range
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | comparator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
buf | pointer to the buffer |
The function launches kernels asynchronously to find the largest element in the range [first, last)
using the given comparator op
. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto largest = first; for (++first; first != last; ++first) { if (op(*largest, *first)) { largest = first; } } return std::distance(first, largest);
const char* tf:: version() constexpr
#include <taskflow/taskflow.hpp>
queries the version information in a string format major.minor.patch
Release notes are available here: https:/
Variable documentation
#include <taskflow/core/graph.hpp>
template<typename P>
bool tf:: is_task_params_v constexpr
determines if the given type is a task parameter type
Task parameters can be specified in one of the following types:
- tf::
TaskParams: assign the struct of defined parameters - tf::
DefaultTaskParams: assign nothing - std::
string: assign a name to the task
std:: array<TaskType, 6> tf:: TASK_TYPES constexpr
#include <taskflow/core/task.hpp>
array of all task types (used for iterating task types)
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_subflow_task_v constexpr
determines if a callable is a dynamic task
A dynamic task is a callable object constructible from std::function<void(Subflow&)>.
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_condition_task_v constexpr
determines if a callable is a condition task
A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>.
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_multi_condition_task_v constexpr
determines if a callable is a multi-condition task
A multi-condition task is a callable object constructible from std::
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_static_task_v constexpr
determines if a callable is a static task
A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>.
#include <taskflow/algorithm/partitioner.hpp>
template<typename P>
bool tf:: is_partitioner_v constexpr
determines if a type is a partitioner
A partitioner is a derived type from tf::