tf namespace

taskflow namespace

Classes

class AsyncTask
class to create a dependent asynchronous task (async task)
template<typename T, size_t LogSize = TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE>
class BoundedTaskQueue
class to create a lock-free bounded single-producer multiple-consumer queue
template<typename T>
struct CachelineAligned
struct to ensure cacheline-aligned storage for an object.
class ChromeObserver
class to create an observer based on Chrome tracing format
template<typename T>
class cudaDeviceAllocator
class to create a CUDA device allocator
class cudaEvent
class to create an RAII-styled wrapper over a native CUDA event
template<unsigned NT, unsigned VT>
class cudaExecutionPolicy
class to define execution policy for CUDA standard algorithms
class cudaFlow
class to create a cudaFlow task dependency graph
class cudaFlowCapturer
class to create a cudaFlow graph using stream capture
class cudaFlowLinearOptimizer
class to capture a linear CUDA graph using a sequential stream
class cudaFlowRoundRobinOptimizer
class to capture a CUDA graph using a round-robin algorithm
class cudaFlowSequentialOptimizer
class to capture a CUDA graph using a sequential stream
class cudaScopedDevice
class to create an RAII-styled context switch
class cudaStream
class to create an RAII-styled wrapper over a native CUDA stream
class cudaTask
class to create a task handle over an internal node of a cudaFlow graph
template<typename T>
class cudaUSMAllocator
class to create a unified shared memory (USM) allocator
template<typename Input, typename Output, typename C>
class DataPipe
class to create a stage in a data-parallel pipeline
template<typename... Ps>
class DataPipeline
class to create a data-parallel pipeline scheduling framework
struct DefaultClosureWrapper
default closure wrapper that simply runs the given closure as is
struct DefaultTaskParams
empty task parameter type for compile-time optimization
template<typename C = DefaultClosureWrapper>
class DynamicPartitioner
class to construct a dynamic partitioner for scheduling parallel algorithms
class Executor
class to create an executor
class FlowBuilder
class to build a task dependency graph
template<typename T>
class Future
class to access the result of an execution
class Graph
class to create a graph object
template<typename C = DefaultClosureWrapper>
class GuidedPartitioner
class to construct a guided partitioner for scheduling parallel algorithms
class ObserverInterface
class to derive an executor observer
template<typename C = DefaultClosureWrapper>
class PartitionerBase
class to derive a partitioner for scheduling parallel algorithms
template<typename C = std::function<void(tf::Pipeflow&)>>
class Pipe
class to create a pipe object for a pipeline stage
class Pipeflow
class to create a pipeflow object used by the pipe callable
template<typename... Ps>
class Pipeline
class to create a pipeline scheduling framework
template<typename C = DefaultClosureWrapper>
class RandomPartitioner
class to construct a random partitioner for scheduling parallel algorithms
class Runtime
class to include a runtime object in a task
template<typename P>
class ScalablePipeline
class to create a scalable pipeline object
class Semaphore
class to create a semophore object for building a concurrency constraint
template<typename T, unsigned N = 2>
class SmallVector
class to define a vector optimized for small array
template<typename C = DefaultClosureWrapper>
class StaticPartitioner
class to construct a static partitioner for scheduling parallel algorithms
class Subflow
class to construct a subflow graph from the execution of a dynamic task
class Task
class to create a task handle over a node in a taskflow graph
class Taskflow
class to create a taskflow object
struct TaskParams
task parameters to use when creating an asynchronous task
class TaskView
class to access task information from the observer interface
class TFProfObserver
class to create an observer based on the built-in taskflow profiler format
template<typename T>
class UnboundedTaskQueue
class to create a lock-free unbounded single-producer multiple-consumer queue
class Worker
class to create a worker in an executor
class WorkerView
class to create an immutable view of a worker in an executor

Enums

enum class TaskType: int { PLACEHOLDER = 0, STATIC, SUBFLOW, CONDITION, MODULE, ASYNC, UNDEFINED }
enumeration of all task types
enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
enumeration of all observer types
enum class PartitionerType: int { STATIC, DYNAMIC }
enumeration of all partitioner types
enum class PipeType: int { PARALLEL = 1, SERIAL = 2 }
enumeration of all pipe types
enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
enumeration of all cudaTask types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
default time point type of observers
using DefaultPartitioner = GuidedPartitioner<>
default partitioner set to tf::GuidedPartitioner
using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
default execution policy

Functions

template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>
auto next_pow2(T x) -> T constexpr
rounds the given 64-bit unsigned integer to the nearest power of 2
template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>
auto is_pow2(const T& x) -> bool constexpr
checks if the given number is a power of 2
template<typename T>
auto log2(T n) -> int constexpr
Computes the floor of log2 of the given positive integer.
template<typename RandItr, typename C>
auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr
finds the median of three numbers pointed to by iterators using the given comparator
template<typename RandItr, typename C>
auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr
finds the pseudo median of a range of items using a spread of nine numbers
template<typename Iter, typename Compare>
void sort2(Iter a, Iter b, Compare comp)
sorts two elements of dereferenced iterators using the given comparison function
template<typename Iter, typename Compare>
void sort3(Iter a, Iter b, Iter c, Compare comp)
Sorts three elements of dereferenced iterators using the given comparison function.
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
auto unique_id() -> T
generates a program-wide unique ID of the given type in a thread-safe manner
template<typename T>
void atomic_max(std::atomic<T>& v, const T& max_v) noexcept
updates an atomic variable with the maximum value
template<typename T>
void atomic_min(std::atomic<T>& v, const T& min_v) noexcept
updates an atomic variable with the minimum value
template<typename T>
auto seed() -> T noexcept
generates a random seed based on the current system clock
auto get_env(const std::string& str) -> std::string
retrieves the value of an environment variable
auto has_env(const std::string& str) -> bool
checks whether an environment variable is defined
void pause()
template<typename P>
void spin_until(P&& predicate)
spins until the given predicate becomes true
auto to_string(TaskType type) -> const char*
convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
overload of ostream inserter operator for Task
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
auto try_acquire(I first, I last) -> bool
tries to acquire all semaphores in the specified range
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
auto try_acquire(S && ... semaphores) -> bool
tries to acquire all semaphores
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void release(I first, I last)
tries to acquire all semaphores in the specified range
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void release(S && ... semaphores)
tries to acquire all semaphores
auto to_string(ObserverType type) -> const char*
convert an observer type to a human-readable string
template<typename Input, typename Output, typename C>
auto make_data_pipe(PipeType d, C&& callable) -> auto
function to construct a data pipe (tf::DataPipe)
auto cuda_get_num_devices() -> size_t
queries the number of available devices
auto cuda_get_device() -> int
gets the current device associated with the caller thread
void cuda_set_device(int id)
switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p)
obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp
obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t
queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t
queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t
queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int
queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int
queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool
queries if the device supports unified addressing
auto cuda_get_driver_version() -> int
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int
queries the CUDA Runtime version (1000 * major + 10 * minor)
auto cuda_get_free_mem(int d) -> size_t
queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t
queries the total available memory (expensive call)
template<typename T>
auto cuda_malloc_device(size_t N, int d) -> T*
allocates memory on the given device for holding N elements of type T
template<typename T>
auto cuda_malloc_device(size_t N) -> T*
allocates memory on the current device associated with the caller
template<typename T>
auto cuda_malloc_shared(size_t N) -> T*
allocates shared memory for holding N elements of type T
template<typename T>
void cuda_free(T* ptr, int d)
frees memory on the GPU device
template<typename T>
void cuda_free(T* ptr)
frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
auto to_string(cudaTaskType type) -> const char* constexpr
convert a cuda_task type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
overload of ostream inserter operator for cudaTask
template<typename P, typename C>
void cuda_single_task(P&& p, C c)
runs a callable asynchronously using one kernel thread
template<typename P, typename I, typename C>
void cuda_for_each(P&& p, I first, I last, C c)
performs asynchronous parallel iterations over a range of items
template<typename P, typename I, typename C>
void cuda_for_each_index(P&& p, I first, I last, I inc, C c)
performs asynchronous parallel iterations over an index-based range of items
template<typename P, typename I, typename O, typename C>
void cuda_transform(P&& p, I first, I last, O output, C op)
performs asynchronous parallel transforms over a range of items
template<typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)
performs asynchronous parallel transforms over two ranges of items
template<typename P, typename I, typename T, typename O>
void cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)
performs asynchronous parallel reduction over a range of items
template<typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)
performs asynchronous parallel reduction over a range of items without an initial value
template<typename P, typename I, typename T, typename O, typename U>
void cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
performs asynchronous parallel reduction over a range of transformed items without an initial value
template<typename P, typename I, typename T, typename O, typename U>
void cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
performs asynchronous parallel reduction over a range of transformed items with an initial value
template<typename P, typename I, typename O, typename C>
void cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
performs asynchronous inclusive scan over a range of items
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
performs asynchronous inclusive scan over a range of transformed items
template<typename P, typename I, typename O, typename C>
void cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
performs asynchronous exclusive scan over a range of items
template<typename P, typename I, typename O, typename C, typename U>
void cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
performs asynchronous exclusive scan over a range of items
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)
performs asynchronous key-value merge over a range of keys and values
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)
performs asynchronous key-only merge over a range of keys
template<typename P, typename K, typename V = cudaEmpty>
auto cuda_sort_buffer_size(unsigned count) -> unsigned
queries the buffer size in bytes needed to call sort kernels for the given number of elements
template<typename P, typename K_it, typename V_it, typename C>
void cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)
performs asynchronous key-value sort on a range of items
template<typename P, typename K_it, typename C>
void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)
performs asynchronous key-only sort on a range of items
template<typename P, typename I, typename U>
void cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)
finds the index of the first element that satisfies the given criteria
template<typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
finds the index of the minimum element in a range
template<typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
finds the index of the maximum element in a range
auto version() -> const char* constexpr
queries the version information in a string format major.minor.patch

Variables

template<typename P>
bool is_task_params_v constexpr
determines if the given type is a task parameter type
std::array<TaskType, 6> TASK_TYPES constexpr
array of all task types (used for iterating task types)
template<typename C>
bool is_subflow_task_v constexpr
determines if a callable is a dynamic task
template<typename C>
bool is_condition_task_v constexpr
determines if a callable is a condition task
template<typename C>
bool is_multi_condition_task_v constexpr
determines if a callable is a multi-condition task
template<typename C>
bool is_static_task_v constexpr
determines if a callable is a static task
template<typename P>
bool is_partitioner_v constexpr
determines if a type is a partitioner

Enum documentation

enum class tf::TaskType: int

enumeration of all task types

Enumerators
PLACEHOLDER

placeholder task type

STATIC

static task type

SUBFLOW

dynamic (subflow) task type

CONDITION

condition task type

MODULE

module task type

ASYNC

asynchronous task type

UNDEFINED

undefined task type (for internal use only)

enum class tf::ObserverType: int

enumeration of all observer types

enum class tf::PartitionerType: int

enumeration of all partitioner types

Enumerators
STATIC

static partitioner type

DYNAMIC

dynamic partitioner type

enum class tf::PipeType: int

enumeration of all pipe types

Enumerators
PARALLEL

parallel type

SERIAL

serial type

enum class tf::cudaTaskType: int

enumeration of all cudaTask types

Enumerators
EMPTY

empty task type

HOST

host task type

MEMSET

memory set task type

MEMCPY

memory copy task type

KERNEL

memory copy task type

SUBFLOW

subflow (child graph) task type

CAPTURE

capture task type

UNDEFINED

undefined task type

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>

default time point type of observers

using tf::DefaultPartitioner = GuidedPartitioner<>

default partitioner set to tf::GuidedPartitioner

Guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.

using tf::cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>

default execution policy

Function documentation

template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>
T tf::next_pow2(T x) constexpr

rounds the given 64-bit unsigned integer to the nearest power of 2

rounds the given 32-bit unsigned integer to the nearest power of 2

template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>
bool tf::is_pow2(const T& x) constexpr

checks if the given number is a power of 2

Template parameters
T The type of the input. Must be an integral type.
Parameters
x The integer to check.
Returns true if x is a power of 2, otherwise false.

This function determines if the given integer is a power of 2.

template<typename T>
int tf::log2(T n) constexpr

Computes the floor of log2 of the given positive integer.

Template parameters
T The type of the input. Must be an integral type.
Parameters
n The positive integer to compute log2 for. Assumes n > 0.
Returns The floor of log2 of n.

This function calculates the largest integer log such that 2^log <= n.

template<typename RandItr, typename C>
RandItr tf::median_of_three(RandItr l, RandItr m, RandItr r, C cmp)

finds the median of three numbers pointed to by iterators using the given comparator

Template parameters
RandItr The type of the random-access iterator.
C The type of the comparator.
Parameters
l Iterator to the first element.
m Iterator to the second element.
r Iterator to the third element.
cmp The comparator used to compare the dereferenced iterator values.
Returns The iterator pointing to the median value among the three elements.

This function determines the median value of the elements pointed to by three random-access iterators using the provided comparator.

template<typename RandItr, typename C>
RandItr tf::pseudo_median_of_nine(RandItr beg, RandItr end, C cmp)

finds the pseudo median of a range of items using a spread of nine numbers

Template parameters
RandItr The type of the random-access iterator.
C The type of the comparator.
Parameters
beg Iterator to the beginning of the range.
end Iterator to the end of the range.
cmp The comparator used to compare the dereferenced iterator values.
Returns The iterator pointing to the pseudo median of the range.

This function computes an approximate median of a range of items by sampling nine values spread across the range and finding their median. It uses a combination of the median_of_three function to determine the pseudo median.

template<typename Iter, typename Compare>
void tf::sort2(Iter a, Iter b, Compare comp)

sorts two elements of dereferenced iterators using the given comparison function

Template parameters
Iter The type of the iterator.
Compare The type of the comparator.
Parameters
a Iterator to the first element.
b Iterator to the second element.
comp The comparator used to compare the dereferenced iterator values.

This function compares two elements pointed to by iterators and swaps them if they are out of order according to the provided comparator.

template<typename Iter, typename Compare>
void tf::sort3(Iter a, Iter b, Iter c, Compare comp)

Sorts three elements of dereferenced iterators using the given comparison function.

Template parameters
Iter The type of the iterator.
Compare The type of the comparator.
Parameters
a Iterator to the first element.
b Iterator to the second element.
c Iterator to the third element.
comp The comparator used to compare the dereferenced iterator values.

This function sorts three elements pointed to by iterators in ascending order according to the provided comparator. The sorting is performed using a sequence of calls to the sort2 function to ensure the correct order of elements.

template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
T tf::unique_id()

generates a program-wide unique ID of the given type in a thread-safe manner

Template parameters
T The type of the ID to generate. Must be an integral type.
Returns A unique ID of type T.

This function provides a globally unique identifier of the specified integral type. It uses a static std::atomic counter to ensure thread safety and increments the counter in a relaxed memory ordering for efficiency.

template<typename T>
void tf::atomic_max(std::atomic<T>& v, const T& max_v) noexcept

updates an atomic variable with the maximum value

Template parameters
T The type of the atomic variable. Must be trivially copyable and comparable.
Parameters
v The atomic variable to update.
max_v The value to compare with the current value of v.

This function atomically updates the provided atomic variable v to hold the maximum of its current value and max_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.

template<typename T>
void tf::atomic_min(std::atomic<T>& v, const T& min_v) noexcept

updates an atomic variable with the minimum value

Template parameters
T The type of the atomic variable. Must be trivially copyable and comparable.
Parameters
v The atomic variable to update.
min_v The value to compare with the current value of v.

This function atomically updates the provided atomic variable v to hold the minimum of its current value and min_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.

template<typename T>
T tf::seed() noexcept

generates a random seed based on the current system clock

Template parameters
T The type of the returned seed. Must be an integral type.
Returns A seed value based on the system clock.

This function returns a seed value derived from the number of clock ticks since the epoch as measured by the system clock. The seed can be used to initialize random number generators.

std::string tf::get_env(const std::string& str)

retrieves the value of an environment variable

Parameters
str The name of the environment variable to retrieve.
Returns The value of the environment variable as a string, or an empty string if not found.

This function fetches the value of an environment variable by name. If the variable is not found, it returns an empty string.

bool tf::has_env(const std::string& str)

checks whether an environment variable is defined

Parameters
str The name of the environment variable to check.
Returns true if the environment variable exists, false otherwise.

This function determines if a specific environment variable exists in the current environment.

void tf::pause()

This function is used in spin-wait loops to hint the CPU that the current thread is in a busy-wait state. It helps reduce power consumption and improves performance on hyper-threaded processors by preventing the CPU from consuming unnecessary cycles while waiting. It is particularly useful in low-contention scenarios, where the thread is likely to quickly acquire the lock or condition it's waiting for, avoiding an expensive context switch. On modern x86 processors, this instruction can be invoked using __builtin_ia32_pause() in GCC/Clang or _mm_pause() in MSVC. In non-x86 architectures, alternative mechanisms such as yielding the CPU may be used instead.

template<typename P>
void tf::spin_until(P&& predicate)

spins until the given predicate becomes true

Template parameters
P the type of the predicate function or callable.
Parameters
predicate the callable that returns a boolean value, which is checked in the loop.

This function repeatedly checks the provided predicate in a spin-wait loop and uses a backoff strategy to minimize CPU waste during the wait. Initially, it uses the pause() instruction for the first 100 iterations to hint to the CPU that the thread is waiting, thus reducing power consumption and avoiding unnecessary cycles. After 100 iterations, it switches to yielding the CPU using std::this_thread::yield() to allow other threads to run and improve system responsiveness.

The function operates as follows:

  1. For the first 100 iterations, it invokes pause() to reduce power consumption during the spin-wait.
  2. After 100 iterations, it uses std::this_thread::yield() to relinquish the CPU, allowing other threads to execute.

const char* tf::to_string(TaskType type)

convert a task type to a human-readable string

The name of each task type is the litte-case string of its characters.

TaskType::PLACEHOLDER     ->  "placeholder"
TaskType::STATIC          ->  "static"
TaskType::SUBFLOW         ->  "subflow"
TaskType::CONDITION       ->  "condition"
TaskType::MODULE          ->  "module"
TaskType::ASYNC           ->  "async"

std::ostream& tf::operator<<(std::ostream& os, const Task& task)

overload of ostream inserter operator for Task

template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
bool tf::try_acquire(I first, I last)

tries to acquire all semaphores in the specified range

Template parameters
I iterator type
Parameters
first iterator to the beginning (inclusive)
last iterator to the end (exclusive)
Returns true if all semaphores are acquired, otherwise false

Tries to acquire all semaphores in the specified range.

template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
bool tf::try_acquire(S && ... semaphores)

tries to acquire all semaphores

Parameters
semaphores semaphores to acquire
Returns true if all semaphores are acquired, otherwise false

Tries to acquire all the semaphores.

template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void tf::release(I first, I last)

tries to acquire all semaphores in the specified range

Template parameters
I iterator type
Parameters
first iterator to the beginning (inclusive)
last iterator to the end (exclusive)

Releases all the semaphores in the given range.

template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void tf::release(S && ... semaphores)

tries to acquire all semaphores

Parameters
semaphores semaphores to release

Releases all the semaphores.

const char* tf::to_string(ObserverType type)

convert an observer type to a human-readable string

template<typename Input, typename Output, typename C>
auto tf::make_data_pipe(PipeType d, C&& callable)

function to construct a data pipe (tf::DataPipe)

Template parameters
Input input data type
Output output data type
C callable type

tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the direction of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to invoke by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to its original form for storage purpose. The callable must take the input data type in its first argument and returns a value of the output data type.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input) {
    return std::to_string(input + 100);
  }
);

The callable can additionally take a reference of tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input, tf::Pipeflow& pf) {
    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
    return std::to_string(input + 100);
  }
);

size_t tf::cuda_get_num_devices()

queries the number of available devices

int tf::cuda_get_device()

gets the current device associated with the caller thread

void tf::cuda_set_device(int id)

switches to a given device context

void tf::cuda_get_device_property(int i, cudaDeviceProp& p)

obtains the device property

cudaDeviceProp tf::cuda_get_device_property(int i)

obtains the device property

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)

dumps the device property

size_t tf::cuda_get_device_max_threads_per_block(int d)

queries the maximum threads per block on a device

size_t tf::cuda_get_device_max_x_dim_per_block(int d)

queries the maximum x-dimension per block on a device

size_t tf::cuda_get_device_max_y_dim_per_block(int d)

queries the maximum y-dimension per block on a device

size_t tf::cuda_get_device_max_z_dim_per_block(int d)

queries the maximum z-dimension per block on a device

size_t tf::cuda_get_device_max_x_dim_per_grid(int d)

queries the maximum x-dimension per grid on a device

size_t tf::cuda_get_device_max_y_dim_per_grid(int d)

queries the maximum y-dimension per grid on a device

size_t tf::cuda_get_device_max_z_dim_per_grid(int d)

queries the maximum z-dimension per grid on a device

size_t tf::cuda_get_device_max_shm_per_block(int d)

queries the maximum shared memory size in bytes per block on a device

size_t tf::cuda_get_device_warp_size(int d)

queries the warp size on a device

int tf::cuda_get_device_compute_capability_major(int d)

queries the major number of compute capability of a device

int tf::cuda_get_device_compute_capability_minor(int d)

queries the minor number of compute capability of a device

bool tf::cuda_get_device_unified_addressing(int d)

queries if the device supports unified addressing

int tf::cuda_get_driver_version()

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int tf::cuda_get_runtime_version()

queries the CUDA Runtime version (1000 * major + 10 * minor)

size_t tf::cuda_get_free_mem(int d)

queries the free memory (expensive call)

size_t tf::cuda_get_total_mem(int d)

queries the total available memory (expensive call)

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function calls malloc_device from the current device associated with the caller.

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free
d device context identifier

This methods call cudaFree to free the memory space pointed to by ptr using the given device context.

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free

This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)

copies data between host and device asynchronously through a stream

Parameters
stream stream identifier
dst destination memory address
src source memory address
count size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)

initializes or sets GPU memory to the given value byte by byte

Parameters
stream stream identifier
devPtr pointer to GPU memory
value value to set for each byte of the specified memory
count size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

const char* tf::to_string(cudaTaskType type) constexpr

convert a cuda_task type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const cudaTask& ct)

overload of ostream inserter operator for cudaTask

template<typename P, typename C>
void tf::cuda_single_task(P&& p, C c)

runs a callable asynchronously using one kernel thread

Template parameters
P execution policy type
C closure type
Parameters
p execution policy
c closure to run by one kernel thread

The function launches a single kernel thread to run the given callable through the stream in the execution policy object.

template<typename P, typename I, typename C>
void tf::cuda_for_each(P&& p, I first, I last, C c)

performs asynchronous parallel iterations over a range of items

Template parameters
P execution policy type
I input iterator type
C unary operator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
c unary operator to apply to each dereferenced iterator

This function is equivalent to a parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  c(*itr);
}

template<typename P, typename I, typename C>
void tf::cuda_for_each_index(P&& p, I first, I last, I inc, C c)

performs asynchronous parallel iterations over an index-based range of items

Template parameters
P execution policy type
I input index type
C unary operator type
Parameters
p execution policy object
first index to the beginning of the range
last index to the end of the range
inc step size between successive iterations
c unary operator to apply to each index

This function is equivalent to a parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  c(i);
}

// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  c(i);
}

template<typename P, typename I, typename O, typename C>
void tf::cuda_transform(P&& p, I first, I last, O output, C op)

performs asynchronous parallel transforms over a range of items

Template parameters
P execution policy type
I input iterator type
O output iterator type
C unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
output iterator to the beginning of the output range
op unary operator to apply to transform each item

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

template<typename P, typename I1, typename I2, typename O, typename C>
void tf::cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)

performs asynchronous parallel transforms over two ranges of items

Template parameters
P execution policy type
I1 first input iterator type
I2 second input iterator type
O output iterator type
C binary operator type
Parameters
p execution policy
first1 iterator to the beginning of the first range
last1 iterator to the end of the first range
first2 iterator to the beginning of the second range
output iterator to the beginning of the output range
op binary operator to apply to transform each pair of items

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

template<typename P, typename I, typename T, typename O>
void tf::cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)

performs asynchronous parallel reduction over a range of items

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
op binary operator to apply to reduce elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = op(*result, *first++);
}

template<typename P, typename I, typename T, typename O>
void tf::cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)

performs asynchronous parallel reduction over a range of items without an initial value

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
op binary operator to apply to reduce elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++;  // no initial values partitipcate in the loop
while (first != last) {
  *result = op(*result, *first++);
}

template<typename P, typename I, typename T, typename O, typename U>
void tf::cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)

performs asynchronous parallel reduction over a range of transformed items without an initial value

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
bop binary operator to apply to reduce elements
uop unary operator to apply to transform elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = bop(*result, uop(*first++));
}

template<typename P, typename I, typename T, typename O, typename U>
void tf::cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)

performs asynchronous parallel reduction over a range of transformed items with an initial value

Template parameters
P execution policy type
I input iterator type
T value type
O binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
res pointer to the result
bop binary operator to apply to reduce elements
uop unary operator to apply to transform elements
buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++);  // no initial values partitipcate in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}

template<typename P, typename I, typename O, typename C>
void tf::cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)

performs asynchronous inclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
op binary operator to apply to scan
buf pointer to the temporary buffer

template<typename P, typename I, typename O, typename C, typename U>
void tf::cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)

performs asynchronous inclusive scan over a range of transformed items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
bop binary operator to apply to scan
uop unary operator to apply to transform each item before scan
buf pointer to the temporary buffer

template<typename P, typename I, typename O, typename C>
void tf::cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)

performs asynchronous exclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
op binary operator to apply to scan
buf pointer to the temporary buffer

template<typename P, typename I, typename O, typename C, typename U>
void tf::cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)

performs asynchronous exclusive scan over a range of items

Template parameters
P execution policy type
I input iterator
O output iterator
C binary operator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the input range
last iterator to the end of the input range
output iterator to the beginning of the output range
bop binary operator to apply to scan
uop unary operator to apply to transform each item before scan
buf pointer to the temporary buffer

template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf::cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)

performs asynchronous key-value merge over a range of keys and values

Template parameters
P execution policy type
a_keys_it first key iterator type
a_vals_it first value iterator type
b_keys_it second key iterator type
b_vals_it second value iterator type
c_keys_it output key iterator type
c_vals_it output value iterator type
C comparator type
Parameters
p execution policy
a_keys_first iterator to the beginning of the first key range
a_keys_last iterator to the end of the first key range
a_vals_first iterator to the beginning of the first value range
b_keys_first iterator to the beginning of the second key range
b_keys_last iterator to the end of the second key range
b_vals_first iterator to the beginning of the second value range
c_keys_first iterator to the beginning of the output key range
c_vals_first iterator to the beginning of the output value range
comp comparator
buf pointer to the temporary buffer

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {1, 8};
  • a_vals = {2, 1};
  • b_keys = {3, 7};
  • b_vals = {3, 4};

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}

template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void tf::cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)

performs asynchronous key-only merge over a range of keys

Template parameters
P execution policy type
a_keys_it first key iterator type
b_keys_it second key iterator type
c_keys_it output key iterator type
C comparator type
Parameters
p execution policy
a_keys_first iterator to the beginning of the first key range
a_keys_last iterator to the end of the first key range
b_keys_first iterator to the beginning of the second key range
b_keys_last iterator to the end of the second key range
c_keys_first iterator to the beginning of the output key range
comp comparator
buf pointer to the temporary buffer

This function is equivalent to tf::cuda_merge_by_key without values.

template<typename P, typename K, typename V = cudaEmpty>
unsigned tf::cuda_sort_buffer_size(unsigned count)

queries the buffer size in bytes needed to call sort kernels for the given number of elements

Template parameters
P execution policy type
K key type
V value type (default tf::cudaEmpty)
Parameters
count number of keys/values to sort

The function is used to allocate a buffer for calling tf::cuda_sort.

template<typename P, typename K_it, typename V_it, typename C>
void tf::cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)

performs asynchronous key-value sort on a range of items

Template parameters
P execution policy type
K_it key iterator type
V_it value iterator type
C comparator type
Parameters
p execution policy
k_first iterator to the beginning of the key range
k_last iterator to the end of the key range
v_first iterator to the beginning of the value range
comp binary comparator
buf pointer to the temporary buffer

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}

template<typename P, typename K_it, typename C>
void tf::cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)

performs asynchronous key-only sort on a range of items

Template parameters
P execution policy type
K_it key iterator type
C comparator type
Parameters
p execution policy
k_first iterator to the beginning of the key range
k_last iterator to the end of the key range
comp binary comparator
buf pointer to the temporary buffer

This method is equivalent to tf::cuda_sort_by_key without values.

template<typename P, typename I, typename U>
void tf::cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)

finds the index of the first element that satisfies the given criteria

Template parameters
P execution policy type
I input iterator type
U unary operator type
Parameters
p execution policy
first iterator to the beginning of the range
last iterator to the end of the range
idx pointer to the index of the found element
op unary operator which returns true for the required element

The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (p(*first)) {
    return idx;
  }
}
return idx;

template<typename P, typename I, typename O>
void tf::cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)

finds the index of the minimum element in a range

Template parameters
P execution policy type
I input iterator type
O comparator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
idx solution index of the minimum element
op comparison function object
buf pointer to the buffer

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto smallest = first;
for (++first; first != last; ++first) {
  if (op(*first, *smallest)) {
    smallest = first;
  }
}
return std::distance(first, smallest);

template<typename P, typename I, typename O>
void tf::cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)

finds the index of the maximum element in a range

Template parameters
P execution policy type
I input iterator type
O comparator type
Parameters
p execution policy object
first iterator to the beginning of the range
last iterator to the end of the range
idx solution index of the maximum element
op comparison function object
buf pointer to the buffer

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto largest = first;
for (++first; first != last; ++first) {
  if (op(*largest, *first)) {
    largest = first;
  }
}
return std::distance(first, largest);

const char* tf::version() constexpr

queries the version information in a string format major.minor.patch

Release notes are available here: https://taskflow.github.io/taskflow/Releases.html

Variable documentation

template<typename P>
bool tf::is_task_params_v constexpr

determines if the given type is a task parameter type

Task parameters can be specified in one of the following types:

std::array<TaskType, 6> tf::TASK_TYPES constexpr

array of all task types (used for iterating task types)

template<typename C>
bool tf::is_subflow_task_v constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()> or std::function<int(tf::Runtime&)>.

template<typename C>
bool tf::is_multi_condition_task_v constexpr

determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()> or std::function<tf::SmallVector<int>(tf::Runtime&)>.

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>.

template<typename P>
bool tf::is_partitioner_v constexpr

determines if a type is a partitioner

A partitioner is a derived type from tf::PartitionerBase.