tf namespace

taskflow namespace

Classes

template<typename T, unsigned N = 2>
class SmallVector
class to define a vector optimized for small arrays
class Graph
class to create a graph object
class Runtime
class to include a runtime object in a task
struct TaskParams
task parameters to use when creating an asynchronous task
struct DefaultTaskParams
empty task parameter type for compile-time optimization
template<typename T>
class UnboundedTaskQueue
class to create a lock-free unbounded single-producer multiple-consumer queue
template<typename T, size_t LogSize = TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE>
class BoundedTaskQueue
class to create a lock-free bounded single-producer multiple-consumer queue
class FlowBuilder
class to build a task dependency graph
class Subflow
class to construct a subflow graph from the execution of a dynamic task
class Worker
class to create a worker in an executor
class WorkerView
class to create an immutable view of a worker in an executor
class Executor
class to create an executor
class Task
class to create a task handle over a node in a taskflow graph
class TaskView
class to access task information from the observer interface
class AsyncTask
class to create a dependent asynchronous task (async task)
class Semaphore
class to create a semaphore object for building a concurrency constraint
class Taskflow
class to create a taskflow object
template<typename T>
class Future
class to access the result of an execution
class ObserverInterface
class to derive an executor observer
class ChromeObserver
class to create an observer based on Chrome tracing format
class TFProfObserver
class to create an observer based on the built-in taskflow profiler format
struct DefaultClosureWrapper
default closure wrapper that simply runs the given closure as is
template<typename C = DefaultClosureWrapper>
class PartitionerBase
class to derive a partitioner for scheduling parallel algorithms
template<typename C = DefaultClosureWrapper>
class GuidedPartitioner
class to construct a guided partitioner for scheduling parallel algorithms
template<typename C = DefaultClosureWrapper>
class DynamicPartitioner
class to construct a dynamic partitioner for scheduling parallel algorithms
template<typename C = DefaultClosureWrapper>
class StaticPartitioner
class to construct a static partitioner for scheduling parallel algorithms
template<typename C = DefaultClosureWrapper>
class RandomPartitioner
class to construct a random partitioner for scheduling parallel algorithms
class Pipeflow
class to create a pipeflow object used by the pipe callable
template<typename C = std::function<void(tf::Pipeflow&)>>
class Pipe
class to create a pipe object for a pipeline stage
template<typename... Ps>
class Pipeline
class to create a pipeline scheduling framework
template<typename P>
class ScalablePipeline
class to create a scalable pipeline object
template<typename Input, typename Output, typename C>
class DataPipe
class to create a stage in a data-parallel pipeline
template<typename... Ps>
class DataPipeline
class to create a data-parallel pipeline scheduling framework
class cudaScopedDevice
class to create an RAII-styled context switch
template<typename T>
class cudaDeviceAllocator
class to create a CUDA device allocator
template<typename T>
class cudaUSMAllocator
class to create a unified shared memory (USM) allocator
class cudaStream
class to create an RAII-styled wrapper over a native CUDA stream
class cudaEvent
class to create an RAII-styled wrapper over a native CUDA event
class cudaTask
class to create a task handle over an internal node of a cudaFlow graph
class cudaFlow
class to create a cudaFlow task dependency graph
class cudaFlowSequentialOptimizer
class to capture a CUDA graph using a sequential stream
class cudaFlowLinearOptimizer
class to capture a linear CUDA graph using a sequential stream
class cudaFlowRoundRobinOptimizer
class to capture a CUDA graph using a round-robin algorithm
class cudaFlowCapturer
class to create a cudaFlow graph using stream capture
template<unsigned NT, unsigned VT>
class cudaExecutionPolicy
class to define execution policy for CUDA standard algorithms

Enums

enum class TaskType: int { PLACEHOLDER = 0, STATIC, SUBFLOW, CONDITION, MODULE, ASYNC, UNDEFINED }
enumeration of all task types
enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
enumeration of all observer types
enum class PartitionerType: int { STATIC, DYNAMIC }
enumeration of all partitioner types
enum class PipeType: int { PARALLEL = 1, SERIAL = 2 }
enumeration of all pipe types
enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
enumeration of all cudaTask types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
default time point type of observers
using DefaultPartitioner = GuidedPartitioner<>
default partitioner set to tf::GuidedPartitioner
using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
default execution policy

Functions

auto to_string(TaskType type) -> const char*
convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
overload of ostream inserter operator for Task
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
auto try_acquire(I first, I last) -> bool
tries to acquire all semaphores in the specified range
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
auto try_acquire(S && ... semaphores) -> bool
tries to acquire all semaphores
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void release(I first, I last)
releases all semaphores in the specified range
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void release(S && ... semaphores)
releases all semaphores
auto to_string(ObserverType type) -> const char*
convert an observer type to a human-readable string
template<typename Input, typename Output, typename C>
auto make_data_pipe(PipeType d, C&& callable) -> auto
function to construct a data pipe (tf::DataPipe)
auto cuda_get_num_devices() -> size_t
queries the number of available devices
auto cuda_get_device() -> int
gets the current device associated with the caller thread
void cuda_set_device(int id)
switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p)
obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp
obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t
queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t
queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t
queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int
queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int
queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool
queries if the device supports unified addressing
auto cuda_get_driver_version() -> int
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int
queries the CUDA Runtime version (1000 * major + 10 * minor)
auto cuda_get_free_mem(int d) -> size_t
queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t
queries the total available memory (expensive call)
template<typename T>
auto cuda_malloc_device(size_t N, int d) -> T*
allocates memory on the given device for holding N elements of type T
template<typename T>
auto cuda_malloc_device(size_t N) -> T*
allocates memory on the current device associated with the caller
template<typename T>
auto cuda_malloc_shared(size_t N) -> T*
allocates shared memory for holding N elements of type T
template<typename T>
void cuda_free(T* ptr, int d)
frees memory on the GPU device
template<typename T>
void cuda_free(T* ptr)
frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
auto to_string(cudaTaskType type) -> const char* constexpr
convert a cudaTask type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
overload of ostream inserter operator for cudaTask
template<typename P, typename C>
void cuda_single_task(P&& p, C c)
runs a callable asynchronously using one kernel thread
template<typename P, typename I, typename C>
void cuda_for_each(P&& p, I first, I last, C c)
performs asynchronous parallel iterations over a range of items
template<typename P, typename I, typename C>
void cuda_for_each_index(P&& p, I first, I last, I inc, C c)
performs asynchronous parallel iterations over an index-based range of items
template<typename P, typename I, typename O, typename C>
void cuda_transform(P&& p, I first, I last, O output, C op)
performs asynchronous parallel transforms over a range of items
template<typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)
performs asynchronous parallel transforms over two ranges of items
template<typename P, typename I, typename T, typename O>
void cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)
performs asynchronous parallel reduction over a range of items
template<typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)
performs asynchronous parallel reduction over a range of items without an initial value
template<typename P, typename I, typename T, typename O, typename U>
void cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
performs asynchronous parallel reduction over a range of transformed items
template<typename P, typename I, typename T, typename O, typename U>
void cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
performs asynchronous parallel reduction over a range of transformed items without an initial value
template<typename P, typename I, typename O, typename C>
void cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
performs asynchronous inclusive scan over a range of items
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
performs asynchronous inclusive scan over a range of transformed items
template<typename P, typename I, typename O, typename C>
void cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
performs asynchronous exclusive scan over a range of items
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
performs asynchronous exclusive scan over a range of items
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)
performs asynchronous key-value merge over a range of keys and values
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)
performs asynchronous key-only merge over a range of keys
template<typename P, typename K, typename V = cudaEmpty>
auto cuda_sort_buffer_size(unsigned count) -> unsigned
queries the buffer size in bytes needed to call sort kernels for the given number of elements
template<typename P, typename K_it, typename V_it, typename C>
void cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)
performs asynchronous key-value sort on a range of items
template<typename P, typename K_it, typename C>
void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)
performs asynchronous key-only sort on a range of items
template<typename P, typename I, typename U>
void cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)
finds the index of the first element that satisfies the given criteria
template<typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
finds the index of the minimum element in a range
template<typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
finds the index of the maximum element in a range
auto version() -> const char* constexpr
queries the version information in a string format major.minor.patch

Variables

template<typename P>
bool is_task_params_v constexpr
determines if the given type is a task parameter type
std::array<TaskType, 6> TASK_TYPES constexpr
array of all task types (used for iterating task types)
template<typename C>
bool is_subflow_task_v constexpr
determines if a callable is a dynamic task
template<typename C>
bool is_condition_task_v constexpr
determines if a callable is a condition task
template<typename C>
bool is_multi_condition_task_v constexpr
determines if a callable is a multi-condition task
template<typename C>
bool is_static_task_v constexpr
determines if a callable is a static task
template<typename P>
bool is_partitioner_v constexpr
determines if a type is a partitioner

Enum documentation

enum class tf::TaskType: int

enumeration of all task types

Enumerators
PLACEHOLDER

placeholder task type

STATIC

static task type

SUBFLOW

dynamic (subflow) task type

CONDITION

condition task type

MODULE

module task type

ASYNC

asynchronous task type

UNDEFINED

undefined task type (for internal use only)

enum class tf::ObserverType: int

enumeration of all observer types

enum class tf::PartitionerType: int

enumeration of all partitioner types

Enumerators
STATIC

static partitioner type

DYNAMIC

dynamic partitioner type

enum class tf::PipeType: int

enumeration of all pipe types

Enumerators
PARALLEL

parallel type

SERIAL

serial type

enum class tf::cudaTaskType: int

enumeration of all cudaTask types

Enumerators
EMPTY

empty task type

HOST

host task type

MEMSET

memory set task type

MEMCPY

memory copy task type

KERNEL

kernel task type

SUBFLOW

subflow (child graph) task type

CAPTURE

capture task type

UNDEFINED

undefined task type

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>

default time point type of observers

using tf::DefaultPartitioner = GuidedPartitioner<>

default partitioner set to tf::GuidedPartitioner

The guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.
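
For example, a partitioner can be passed as the last argument of a parallel algorithm such as tf::Taskflow::for_each. The following is a minimal sketch; the container size and task body are illustrative only:

std::vector<int> data(100000);

tf::Taskflow taskflow;
tf::Executor executor;

// explicitly select a partitioner; tf::DefaultPartitioner is used when omitted
taskflow.for_each(data.begin(), data.end(),
  [](int& i) { i = 1; },
  tf::GuidedPartitioner<>()
);

executor.run(taskflow).wait();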

using tf::cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>

default execution policy

Function documentation

const char* tf::to_string(TaskType type)

convert a task type to a human-readable string

The name of each task type is the lowercase string of its enumerator.

TaskType::PLACEHOLDER     ->  "placeholder"
TaskType::STATIC          ->  "static"
TaskType::SUBFLOW         ->  "subflow"
TaskType::CONDITION       ->  "condition"
TaskType::MODULE          ->  "module"
TaskType::ASYNC           ->  "async"
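
For example, the mapping can be printed by iterating tf::TASK_TYPES (a minimal sketch):

for(tf::TaskType t : tf::TASK_TYPES) {
  std::cout << tf::to_string(t) << '\n';
}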

std::ostream& tf::operator<<(std::ostream& os, const Task& task)

overload of ostream inserter operator for Task

template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
bool tf::try_acquire(I first, I last)

tries to acquire all semaphores in the specified range

Template parameters
I iterator type
Parameters
first iterator to the beginning (inclusive)
last iterator to the end (exclusive)
Returns true if all semaphores are acquired, otherwise false

Tries to acquire all semaphores in the specified range.

template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
bool tf::try_acquire(S && ... semaphores)

tries to acquire all semaphores

Parameters
semaphores semaphores to acquire
Returns true if all semaphores are acquired, otherwise false

Tries to acquire all the semaphores.

template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void tf::release(I first, I last)

releases all semaphores in the specified range

Template parameters
I iterator type
Parameters
first iterator to the beginning (inclusive)
last iterator to the end (exclusive)

Releases all the semaphores in the given range.

template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void tf::release(S && ... semaphores)

releases all semaphores

Parameters
semaphores semaphores to release

Releases all the semaphores.
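
The sketch below acquires two semaphores atomically and releases them afterwards; it assumes tf::Semaphore objects constructed with an initial count of 1:

tf::Semaphore s1(1), s2(1);

if(tf::try_acquire(s1, s2)) {
  // ... critical section protected by both semaphores ...
  tf::release(s1, s2);
}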

const char* tf::to_string(ObserverType type)

convert an observer type to a human-readable string

template<typename Input, typename Output, typename C>
auto tf::make_data_pipe(PipeType d, C&& callable)

function to construct a data pipe (tf::DataPipe)

Template parameters
Input input data type
Output output data type
C callable type

tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable invoked by the pipeline scheduler. Input and output data types are specified via template parameters, which the library always decays to their original forms for storage. The callable must take the input data type as its first argument and return a value of the output data type.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input) {
    return std::to_string(input + 100);
  }
);

The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input, tf::Pipeflow& pf) {
    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
    return std::to_string(input + 100);
  }
);
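
A complete data-parallel pipeline composes several data pipes into a tf::DataPipeline and runs it through an executor. The sketch below follows this pattern; the number of parallel lines and the stopping token count are arbitrary choices:

tf::Taskflow taskflow;
tf::Executor executor;

tf::DataPipeline pl(4,
  // the first pipe generates tokens until five have been produced
  tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
    if(pf.token() == 5) {
      pf.stop();
      return 0;
    }
    return static_cast<int>(pf.token());
  }),
  // the second pipe transforms each integer into a string
  tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL, [](int& input) {
    return std::to_string(input + 100);
  }),
  // the last pipe consumes the strings
  tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL, [](std::string& s) {
    printf("%s\n", s.c_str());
  })
);

taskflow.composed_of(pl).name("pipeline");
executor.run(taskflow).wait();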

size_t tf::cuda_get_num_devices()

queries the number of available devices

int tf::cuda_get_device()

gets the current device associated with the caller thread

void tf::cuda_set_device(int id)

switches to a given device context

void tf::cuda_get_device_property(int i, cudaDeviceProp& p)

obtains the device property

cudaDeviceProp tf::cuda_get_device_property(int i)

obtains the device property

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)

dumps the device property

size_t tf::cuda_get_device_max_threads_per_block(int d)

queries the maximum threads per block on a device

size_t tf::cuda_get_device_max_x_dim_per_block(int d)

queries the maximum x-dimension per block on a device

size_t tf::cuda_get_device_max_y_dim_per_block(int d)

queries the maximum y-dimension per block on a device

size_t tf::cuda_get_device_max_z_dim_per_block(int d)

queries the maximum z-dimension per block on a device

size_t tf::cuda_get_device_max_x_dim_per_grid(int d)

queries the maximum x-dimension per grid on a device

size_t tf::cuda_get_device_max_y_dim_per_grid(int d)

queries the maximum y-dimension per grid on a device

size_t tf::cuda_get_device_max_z_dim_per_grid(int d)

queries the maximum z-dimension per grid on a device

size_t tf::cuda_get_device_max_shm_per_block(int d)

queries the maximum shared memory size in bytes per block on a device

size_t tf::cuda_get_device_warp_size(int d)

queries the warp size on a device

int tf::cuda_get_device_compute_capability_major(int d)

queries the major number of compute capability of a device

int tf::cuda_get_device_compute_capability_minor(int d)

queries the minor number of compute capability of a device

bool tf::cuda_get_device_unified_addressing(int d)

queries if the device supports unified addressing

int tf::cuda_get_driver_version()

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int tf::cuda_get_runtime_version()

queries the CUDA Runtime version (1000 * major + 10 * minor)

size_t tf::cuda_get_free_mem(int d)

queries the free memory (expensive call)

size_t tf::cuda_get_total_mem(int d)

queries the total available memory (expensive call)
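
The device query functions above can be combined to inspect every visible device, for example (a minimal sketch):

for(int d = 0, n = static_cast<int>(tf::cuda_get_num_devices()); d < n; ++d) {
  std::cout << "device " << d << ": compute capability "
            << tf::cuda_get_device_compute_capability_major(d) << '.'
            << tf::cuda_get_device_compute_capability_minor(d)
            << ", warp size " << tf::cuda_get_device_warp_size(d)
            << ", max threads per block "
            << tf::cuda_get_device_max_threads_per_block(d) << '\n';
}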

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the current device associated with the caller and returns a pointer to the starting address of the device memory.

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free
d device context identifier

This method calls cudaFree to free the memory space pointed to by ptr using the given device context.

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free

This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.
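
A typical allocation sequence pairs the cuda_malloc_* and cuda_free calls on the same device (a minimal sketch; error handling omitted):

// 1024 ints in device memory on device 0, and 1024 ints in unified shared memory
int* dmem = tf::cuda_malloc_device<int>(1024, 0);
int* smem = tf::cuda_malloc_shared<int>(1024);

// ... use dmem from kernels and smem from both host and kernels ...

tf::cuda_free(dmem, 0);
tf::cuda_free(smem);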

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)

copies data between host and device asynchronously through a stream

Parameters
stream stream identifier
dst destination memory address
src source memory address
count size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)

initializes or sets GPU memory to the given value byte by byte

Parameters
stream stream identifier
devPtr pointer to GPU memory
value value to set for each byte of the specified memory
count size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.
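
The sketch below zero-fills a device buffer and copies host data to and from it through a stream; the raw CUDA stream calls are used only for illustration:

cudaStream_t stream;
cudaStreamCreate(&stream);

size_t N = 1024;
std::vector<int> host(N, 1);
int* dev = tf::cuda_malloc_device<int>(N);

tf::cuda_memset_async(stream, dev, 0, N*sizeof(int));           // zero the device buffer
tf::cuda_memcpy_async(stream, dev, host.data(), N*sizeof(int)); // host-to-device copy
tf::cuda_memcpy_async(stream, host.data(), dev, N*sizeof(int)); // device-to-host copy

cudaStreamSynchronize(stream);
tf::cuda_free(dev);
cudaStreamDestroy(stream);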

const char* tf::to_string(cudaTaskType type) constexpr

convert a cudaTask type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const cudaTask& ct)

overload of ostream inserter operator for cudaTask

template<typename P, typename C>
void tf::cuda_single_task(P&& p, C c)

runs a callable asynchronously using one kernel thread

Template parameters
P execution policy type
C closure type
Parameters
p execution policy
c closure to run by one kernel thread

The function launches a single kernel thread to run the given callable through the stream in the execution policy object.
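
A minimal sketch, assuming tf::cudaDefaultExecutionPolicy can be constructed from a native CUDA stream and that extended (__device__) lambdas are enabled:

cudaStream_t stream;
cudaStreamCreate(&stream);
tf::cudaDefaultExecutionPolicy policy(stream);

int* answer = tf::cuda_malloc_shared<int>(1);
tf::cuda_single_task(policy, [answer] __device__ () { *answer = 42; });

cudaStreamSynchronize(stream);  // *answer now holds 42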

template<typename P, typename I, typename C>
void tf::cuda_for_each(P&& p, I first, I last, C c)

performs asynchronous parallel iterations over a range of items

Template parameters
P execution policy type
I input iterator type
C unary operator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
c unary operator to apply to each dereferenced iterator

This function is equivalent to a parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  c(*itr);
}
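
For instance, the following sketch squares N integers in unified shared memory (policy and stream set up as in the tf::cuda_single_task sketch above):

size_t N = 1000;
int* data = tf::cuda_malloc_shared<int>(N);
// ... initialize data ...

tf::cuda_for_each(policy, data, data + N, [] __device__ (int& x) { x = x * x; });
cudaStreamSynchronize(stream);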

template<typename P, typename I, typename C>
void tf::cuda_for_each_index(P&& p, I first, I last, I inc, C c)

performs asynchronous parallel iterations over an index-based range of items

Template parameters
P execution policy type
I input index type
C unary operator type
Parameters
p execution policy object
first index to the beginning of the range
last index to the end of the range
inc step size between successive iterations
c unary operator to apply to each index

This function is equivalent to a parallel execution of the following loop on a GPU:

// inc is positive: iterates [first, last)
for(auto i=first; i<last; i+=inc) {
  c(i);
}

// inc is negative: iterates [first, last)
for(auto i=first; i>last; i+=inc) {
  c(i);
}

template<typename P, typename I, typename O, typename C>
void tf::cuda_transform(P&& p, I first, I last, O output, C op)

performs asynchronous parallel transforms over a range of items

Template parameters
P execution policy type
I input iterator type
O output iterator type
C unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
output iterator to the beginning of the output range
op unary operator to apply to transform each item

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

template<typename P, typename I1, typename I2, typename O, typename C>
void tf::cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)

performs asynchronous parallel transforms over two ranges of items

Template parameters
P execution policy type
I1 first input iterator type
I2 second input iterator type
O output iterator type
C binary operator type
Parameters
p execution policy
first1 iterator to the beginning of the first range
last1 iterator to the end of the first range
first2 iterator to the beginning of the second range
output iterator to the beginning of the output range
op binary operator to apply to transform each pair of items

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

template<typename P, typename I, typename T, typename O>
void tf::cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)

performs asynchronous parallel reduction over a range of items

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
op binary operator to apply to reduce elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = op(*result, *first++);
}

template<typename P, typename I, typename T, typename O>
void tf::cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)

performs asynchronous parallel reduction over a range of items without an initial value

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
op binary operator to apply to reduce elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++;  // no initial value participates in the loop
while (first != last) {
  *result = op(*result, *first++);
}

template<typename P, typename I, typename T, typename O, typename U>
void tf::cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)

performs asynchronous parallel reduction over a range of transformed items

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
bop binary operator to apply to reduce elements
uop unary operator to apply to transform elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = bop(*result, uop(*first++));
}

template<typename P, typename I, typename T, typename O, typename U>
void tf::cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)

performs asynchronous parallel reduction over a range of transformed items without an initial value

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
bop binary operator to apply to reduce elements
uop unary operator to apply to transform elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++);  // no initial value participates in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}

template<typename P, typename I, typename O, typename C>
void tf::cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)

performs asynchronous inclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
op binary operator to apply to scan
buf pointer to the temporary buffer
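
The result is equivalent to the following sequential computation (a sketch only; the kernels compute it in parallel on the GPU):

// inclusive scan: output[i] = op(first[0], ..., first[i])
if(first != last) {
  auto running = *first;
  *output++ = running;
  for(auto itr = std::next(first); itr != last; ++itr) {
    running = op(running, *itr);
    *output++ = running;
  }
}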

template<typename P, typename I, typename O, typename C, typename U>
void tf::cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)

performs asynchronous inclusive scan over a range of transformed items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
bop binary operator to apply to scan
uop unary operator to apply to transform each item before scan
buf pointer to the temporary buffer

template<typename P, typename I, typename O, typename C>
void tf::cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)

performs asynchronous exclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
op binary operator to apply to scan
buf pointer to the temporary buffer

template<typename P, typename I, typename O, typename C, typename U>
void tf::cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)

performs asynchronous exclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
bop binary operator to apply to scan
uop unary operator to apply to transform each item before scan
buf pointer to the temporary buffer

template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf::cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)

performs asynchronous key-value merge over a range of keys and values

Template parameters
P execution policy type
a_keys_it first key iterator type
a_vals_it first value iterator type
b_keys_it second key iterator type
b_vals_it second value iterator type
c_keys_it output key iterator type
c_vals_it output value iterator type
C comparator type
Parameters
p execution policy
a_keys_first iterator to the beginning of the first key range
a_keys_last iterator to the end of the first key range
a_vals_first iterator to the beginning of the first value range
b_keys_first iterator to the beginning of the second key range
b_keys_last iterator to the end of the second key range
b_vals_first iterator to the beginning of the second value range
c_keys_first iterator to the beginning of the output key range
c_vals_first iterator to the beginning of the output value range
comp comparator
buf pointer to the temporary buffer

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {1, 8};
  • a_vals = {2, 1};
  • b_keys = {3, 7};
  • b_vals = {3, 4};

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}

template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void tf::cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)

performs asynchronous key-only merge over a range of keys

Template parameters
P execution policy type
a_keys_it first key iterator type
b_keys_it second key iterator type
c_keys_it output key iterator type
C comparator type
Parameters
p execution policy
a_keys_first iterator to the beginning of the first key range
a_keys_last iterator to the end of the first key range
b_keys_first iterator to the beginning of the second key range
b_keys_last iterator to the end of the second key range
c_keys_first iterator to the beginning of the output key range
comp comparator
buf pointer to the temporary buffer

This function is equivalent to tf::cuda_merge_by_key without values.

template<typename P, typename K, typename V = cudaEmpty>
unsigned tf::cuda_sort_buffer_size(unsigned count)

queries the buffer size in bytes needed to call sort kernels for the given number of elements

Template parameters
P execution policy type
K key type
V value type (default tf::cudaEmpty)
Parameters
count number of keys/values to sort

The function is used to allocate a buffer for calling tf::cuda_sort.

template<typename P, typename K_it, typename V_it, typename C>
void tf::cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)

performs asynchronous key-value sort on a range of items

Template parameters
P execution policy type
K_it key iterator type
V_it value iterator type
C comparator type
Parameters
p execution policy
k_first iterator to the beginning of the key range
k_last iterator to the end of the key range
v_first iterator to the beginning of the value range
comp binary comparator
buf pointer to the temporary buffer

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}

template<typename P, typename K_it, typename C>
void tf::cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)

performs asynchronous key-only sort on a range of items

Template parameters
P execution policy type
K_it key iterator type
C comparator type
Parameters
p execution policy
k_first iterator to the beginning of the key range
k_last iterator to the end of the key range
comp binary comparator
buf pointer to the temporary buffer

This method is equivalent to tf::cuda_sort_by_key without values.
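
The sketch below sorts integer keys in place, using tf::cuda_sort_buffer_size to size the temporary buffer (policy and stream set up as in the earlier sketches; a __device__ lambda serves as the comparator):

size_t N = 1 << 20;
int* keys = tf::cuda_malloc_shared<int>(N);
// ... fill keys ...

// query and allocate the temporary buffer required by the sort kernels
unsigned bufsz = tf::cuda_sort_buffer_size<tf::cudaDefaultExecutionPolicy, int>(
  static_cast<unsigned>(N)
);
unsigned char* buf = tf::cuda_malloc_device<unsigned char>(bufsz);

tf::cuda_sort(policy, keys, keys + N,
  [] __device__ (int a, int b) { return a < b; }, buf
);
cudaStreamSynchronize(stream);

tf::cuda_free(buf);
tf::cuda_free(keys);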

template<typename P, typename I, typename U>
void tf::cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)

finds the index of the first element that satisfies the given criteria

Template parameters
P execution policy type
I input iterator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
idx pointer to the index of the found element
op unary operator which returns true for the required element

The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (op(*first)) {
    return idx;
  }
}
return idx;
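
For example, the sketch below searches for the first negative value in unified shared memory; idx must be device-accessible because the kernels write the result into it (policy and stream as in the earlier sketches):

size_t N = 1024;
int* data = tf::cuda_malloc_shared<int>(N);
unsigned* idx = tf::cuda_malloc_shared<unsigned>(1);
// ... fill data ...

tf::cuda_find_if(policy, data, data + N, idx, [] __device__ (int x) { return x < 0; });
cudaStreamSynchronize(stream);
// *idx equals N if no element satisfies the predicate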

template<typename P, typename I, typename O>
void tf::cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)

finds the index of the minimum element in a range

Template parameters
P execution policy type
I input iterator type
O comparator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
idx pointer to the solution index of the minimum element
op comparison function object
buf pointer to the temporary buffer

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto smallest = first;
for(auto itr = std::next(first); itr != last; ++itr) {
  if (op(*itr, *smallest)) {
    smallest = itr;
  }
}
return std::distance(first, smallest);

template<typename P, typename I, typename O>
void tf::cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)

finds the index of the maximum element in a range

Template parameters
P execution policy type
I input iterator type
O comparator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
idx pointer to the solution index of the maximum element
op comparison function object
buf pointer to the temporary buffer

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto largest = first;
for(auto itr = std::next(first); itr != last; ++itr) {
  if (op(*largest, *itr)) {
    largest = itr;
  }
}
return std::distance(first, largest);

const char* tf::version() constexpr

queries the version information in a string format major.minor.patch

Release notes are available here: https://taskflow.github.io/taskflow/Releases.html

Variable documentation

template<typename P>
bool tf::is_task_params_v constexpr

determines if the given type is a task parameter type

Task parameters can be specified in one of the following types: tf::TaskParams, tf::DefaultTaskParams, or a type constructible to std::string (interpreted as the task name).

std::array<TaskType, 6> tf::TASK_TYPES constexpr

array of all task types (used for iterating task types)

template<typename C>
bool tf::is_subflow_task_v constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>.

template<typename C>
bool tf::is_multi_condition_task_v constexpr

determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()> or std::function<tf::SmallVector<int>(tf::Runtime&)>.

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>.
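
The sketch below shows callables that satisfy these traits when emplaced on a tf::Taskflow (the task bodies are illustrative only):

tf::Taskflow taskflow;

// static task: constructible from std::function<void()>
tf::Task a = taskflow.emplace([]() { printf("static task\n"); });

// subflow (dynamic) task: constructible from std::function<void(tf::Subflow&)>
tf::Task b = taskflow.emplace([](tf::Subflow& sf) {
  sf.emplace([]() { printf("task spawned inside b\n"); });
});

// condition task: constructible from std::function<int()>;
// the returned index selects which successor to run next
tf::Task c = taskflow.emplace([]() { return 0; });

a.precede(b);
b.precede(c);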

template<typename P>
bool tf::is_partitioner_v constexpr

determines if a type is a partitioner

A partitioner is a type derived from tf::PartitionerBase.