tf namespace

taskflow namespace

Classes

class AsyncTask: class to create a dependent asynchronous task (async task)
template<typename T, size_t LogSize = TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE> class BoundedTaskQueue: class to create a lock-free bounded work-stealing queue
template<typename T> struct CachelineAligned: struct to ensure cacheline-aligned storage for an object.
class ChromeObserver: class to create an observer based on Chrome tracing format
template<typename Creator, typename Deleter> class cudaEventBase: class to create a smart pointer wrapper for managing cudaEvent_t
struct cudaEventCreator: functor to create a cudaEvent_t object
struct cudaEventDeleter: functor to delete a cudaEvent_t object
template<typename Creator, typename Deleter> class cudaGraphBase: class to create a CUDA graph managed by C++ smart pointer
struct cudaGraphCreator: a functor for creating a CUDA graph
struct cudaGraphDeleter: a functor for deleting a CUDA graph
template<typename Creator, typename Deleter> class cudaGraphExecBase: class to create an executable CUDA graph managed by C++ smart pointer
struct cudaGraphExecCreator: a functor for creating an executable CUDA graph
struct cudaGraphExecDeleter: a functor for deleting an executable CUDA graph
class cudaScopedDevice: class to create an RAII-styled context switch
template<typename Creator, typename Deleter> class cudaStreamBase: class to create a smart pointer wrapper for managing cudaStream_t
struct cudaStreamCreator: functor to create a cudaStream_t object
struct cudaStreamDeleter: functor to delete a cudaStream_t object
class cudaTask: class to create a task handle of a CUDA Graph node
template<typename Input, typename Output, typename C> class DataPipe: class to create a stage in a data-parallel pipeline
template<typename... Ps> class DataPipeline: class to create a data-parallel pipeline scheduling framework
struct DefaultClosureWrapper: default closure wrapper that simply runs the given closure as is
struct DefaultTaskParams: empty task parameter type for compile-time optimization
template<typename C = DefaultClosureWrapper> class DynamicPartitioner: class to construct a dynamic partitioner for scheduling parallel algorithms
class Executor: class to create an executor
class FlowBuilder: class to build a task dependency graph
template<typename T> class Future: class to access the result of an execution
class Graph: class to create a graph object
template<typename C = DefaultClosureWrapper> class GuidedPartitioner: class to construct a guided partitioner for scheduling parallel algorithms
template<typename T> class IndexRange: class to create an index range of integral indices with a step size
class ObserverInterface: class to derive an executor observer
template<typename C = DefaultClosureWrapper> class PartitionerBase: class to derive a partitioner for scheduling parallel algorithms
template<typename C = std::function<void(tf::Pipeflow&)>> class Pipe: class to create a pipe object for a pipeline stage
class Pipeflow: class to create a pipeflow object used by the pipe callable
template<typename... Ps> class Pipeline: class to create a pipeline scheduling framework
template<typename C = DefaultClosureWrapper> class RandomPartitioner: class to construct a random partitioner for scheduling parallel algorithms
class Runtime: class to include a runtime object in a task
template<typename P> class ScalablePipeline: class to create a scalable pipeline object
class Semaphore: class to create a semaphore object for building a concurrency constraint
template<typename T, unsigned N = 2> class SmallVector: class to define a vector optimized for small arrays
template<typename C = DefaultClosureWrapper> class StaticPartitioner: class to construct a static partitioner for scheduling parallel algorithms
class Subflow: class to construct a subflow graph from the execution of a dynamic task
class Task: class to create a task handle over a node in a taskflow graph
class Taskflow: class to create a taskflow object
struct TaskParams: task parameters to use when creating an asynchronous task
class TaskView: class to access task information from the observer interface
class TFProfObserver: class to create an observer based on the built-in taskflow profiler format
template<typename T> class UnboundedTaskQueue: class to create a lock-free unbounded work-stealing queue
class Worker: class to create a worker in an executor
class WorkerInterface: class to configure worker behavior in an executor
class WorkerView: class to create an immutable view of a worker

Enums

enum class TaskType: int { PLACEHOLDER = 0, STATIC, RUNTIME, SUBFLOW, CONDITION, MODULE, ASYNC, UNDEFINED }: enumeration of all task types
enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }: enumeration of all observer types
enum class PartitionerType: int { STATIC, DYNAMIC }: enumeration of all partitioner types
enum class PipeType: int { PARALLEL = 1, SERIAL = 2 }: enumeration of all pipe types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>: default time point type of observers
using DefaultPartitioner = GuidedPartitioner<>: default partitioner set to tf::GuidedPartitioner
using cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter>: default smart pointer type to manage a cudaEvent_t object with unique ownership
using cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter>: default smart pointer type to manage a cudaStream_t object with unique ownership
using cudaGraph = cudaGraphBase<cudaGraphCreator, cudaGraphDeleter>: default smart pointer type to manage a cudaGraph_t object with unique ownership
using cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter>: default smart pointer type to manage a cudaGraphExec_t object with unique ownership

Functions

template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr> auto next_pow2(T x) -> T constexpr: rounds the given 64-bit unsigned integer to the nearest power of 2
template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr> auto is_pow2(const T& x) -> bool constexpr: checks if the given number is a power of 2
template<typename T> auto floor_log2(T n) -> size_t constexpr: computes the floor of the base-2 logarithm of a number using count-leading-zeros (CLZ).
template<size_t N> auto static_floor_log2() -> size_t constexpr: returns the floor of log2(N) at compile time
template<typename RandItr, typename C> auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr: finds the median of three numbers pointed to by iterators using the given comparator
template<typename RandItr, typename C> auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr: finds the pseudo median of a range of items using a spread of nine numbers
template<typename Iter, typename Compare> void sort2(Iter a, Iter b, Compare comp): sorts two elements of dereferenced iterators using the given comparison function
template<typename Iter, typename Compare> void sort3(Iter a, Iter b, Iter c, Compare comp): Sorts three elements of dereferenced iterators using the given comparison function.
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr> auto unique_id() -> T: generates a program-wide unique ID of the given type in a thread-safe manner
template<typename T> void atomic_max(std::atomic<T>& v, const T& max_v) noexcept: updates an atomic variable with the maximum value
template<typename T> void atomic_min(std::atomic<T>& v, const T& min_v) noexcept: updates an atomic variable with the minimum value
template<typename T> auto seed() -> T noexcept: generates a random seed based on the current system clock
template<typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> auto ctz(T x) -> auto: counts the number of trailing zeros in an integer.
auto coprime(size_t N) -> size_t constexpr: computes a coprime of a given number
template<size_t N> auto make_coprime_lut() -> std::array<size_t, N> constexpr: generates a compile-time array of coprimes for numbers from 0 to N-1
auto get_env(const std::string& str) -> std::string: retrieves the value of an environment variable
auto has_env(const std::string& str) -> bool: checks whether an environment variable is defined
void pause(): hints to the CPU that the current thread is in a spin-wait loop
void pause(size_t count): pause CPU for a specified number of iterations
template<typename P> void spin_until(P&& predicate): spins until the given predicate becomes true
template<typename B, typename E, typename S> auto is_index_range_invalid(B beg, E end, S step) -> std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, bool> constexpr: checks if the given index range is invalid
template<typename B, typename E, typename S> auto distance(B beg, E end, S step) -> std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, size_t> constexpr: calculates the number of iterations in the given index range
template<typename T, typename... ArgsT> auto make_worker_interface(ArgsT && ... args) -> std::unique_ptr<T>: helper function to create an instance derived from tf::WorkerInterface
auto to_string(TaskType type) -> const char*: convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&: overload of ostream inserter operator for Task
auto to_string(ObserverType type) -> const char*: convert an observer type to a human-readable string
template<typename Input, typename Output, typename C> auto make_data_pipe(PipeType d, C&& callable) -> auto: function to construct a data pipe (tf::DataPipe)
template<typename T> auto make_module_task(T&& target) -> auto: creates a module task using the given target
auto cuda_get_num_devices() -> size_t: queries the number of available devices
auto cuda_get_device() -> int: gets the current device associated with the caller thread
void cuda_set_device(int id): switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p): obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp: obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p): dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t: queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t: queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t: queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t: queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t: queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t: queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t: queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t: queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t: queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int: queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int: queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool: queries if the device supports unified addressing
auto cuda_get_driver_version() -> int: queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int: queries the CUDA Runtime version (1000 * major + 10 * minor)
auto cuda_get_free_mem(int d) -> size_t: queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t: queries the total available memory (expensive call)
template<typename T> auto cuda_malloc_device(size_t N, int d) -> T*: allocates memory on the given device for holding N elements of type T
template<typename T> auto cuda_malloc_device(size_t N) -> T*: allocates memory on the current device associated with the caller
template<typename T> auto cuda_malloc_shared(size_t N) -> T*: allocates shared memory for holding N elements of type T
template<typename T> void cuda_free(T* ptr, int d): frees memory on the GPU device
template<typename T> void cuda_free(T* ptr): frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count): copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count): initializes or sets GPU memory to the given value byte by byte
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> auto cuda_get_copy_parms(T* tgt, const T* src, size_t num) -> cudaMemcpy3DParms: gets the memcpy node parameter of a copy task
auto cuda_get_memcpy_parms(void* tgt, const void* src, size_t bytes) -> cudaMemcpy3DParms: gets the memcpy node parameter of a memcpy task (untyped)
auto cuda_get_memset_parms(void* dst, int ch, size_t count) -> cudaMemsetParams: gets the memset node parameter of a memset task (untyped)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto cuda_get_fill_parms(T* dst, T value, size_t count) -> cudaMemsetParams: gets the memset node parameter of a fill task (typed)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto cuda_get_zero_parms(T* dst, size_t count) -> cudaMemsetParams: gets the memset node parameter of a zero task (typed)
auto cuda_graph_get_num_root_nodes(cudaGraph_t graph) -> size_t: queries the number of root nodes in a native CUDA graph
auto cuda_graph_get_num_nodes(cudaGraph_t graph) -> size_t: queries the number of nodes in a native CUDA graph
auto cuda_graph_get_num_edges(cudaGraph_t graph) -> size_t: queries the number of edges in a native CUDA graph
auto cuda_graph_get_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>: acquires the nodes in a native CUDA graph
auto cuda_graph_get_root_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>: acquires the root nodes in a native CUDA graph
auto cuda_graph_get_edges(cudaGraph_t graph) -> std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>: acquires the edges in a native CUDA graph
auto cuda_get_graph_node_type(cudaGraphNode_t node) -> cudaGraphNodeType: queries the type of a native CUDA graph node
auto to_string(cudaGraphNodeType type) -> const char* constexpr: convert a cuda_task type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&: overload of ostream inserter operator for cudaTask
auto version() -> const char* constexpr: queries the version information in a string format major.minor.patch

Variables

template<typename P> bool is_task_params_v constexpr: determines if the given type is a task parameter type
template<typename T> bool has_graph_v constexpr: determines if the given type has a member function Graph& graph()
std::array<TaskType, 7> TASK_TYPES constexpr: array of all task types (used for iterating task types)
template<typename C> bool is_static_task_v constexpr: determines if a callable is a static task
template<typename C> bool is_subflow_task_v constexpr: determines if a callable is a subflow task
template<typename C> bool is_runtime_task_v constexpr: determines if a callable is a runtime task
template<typename C> bool is_condition_task_v constexpr: determines if a callable is a condition task
template<typename C> bool is_multi_condition_task_v constexpr: determines if a callable is a multi-condition task
template<typename P> bool is_partitioner_v constexpr: determines if a type is a partitioner

Enum documentation

enum class tf::TaskType: int
#include <taskflow/core/task.hpp>

enumeration of all task types

Enumerators
PLACEHOLDER	placeholder task type
STATIC	static task type
RUNTIME	runtime task type
SUBFLOW	dynamic (subflow) task type
CONDITION	condition task type
MODULE	module task type
ASYNC	asynchronous task type
UNDEFINED	undefined task type (for internal use only)

enum class tf::ObserverType: int
#include <taskflow/core/observer.hpp>

enumeration of all observer types

enum class tf::PartitionerType: int
#include <taskflow/algorithm/partitioner.hpp>

enumeration of all partitioner types

Enumerators
STATIC	static partitioner type
DYNAMIC	dynamic partitioner type

enum class tf::PipeType: int
#include <taskflow/algorithm/pipeline.hpp>

enumeration of all pipe types

Enumerators
PARALLEL	parallel type
SERIAL	serial type

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
#include <taskflow/core/observer.hpp>

default time point type of observers

using tf::DefaultPartitioner = GuidedPartitioner<>
#include <taskflow/algorithm/partitioner.hpp>

default partitioner set to tf::GuidedPartitioner

The guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.

using tf::cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter>
#include <taskflow/cuda/cuda_stream.hpp>

default smart pointer type to manage a cudaEvent_t object with unique ownership

using tf::cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter>
#include <taskflow/cuda/cuda_stream.hpp>

default smart pointer type to manage a cudaStream_t object with unique ownership

using tf::cudaGraph = cudaGraphBase<cudaGraphCreator, cudaGraphDeleter>
#include <taskflow/cuda/cudaflow.hpp>

default smart pointer type to manage a cudaGraph_t object with unique ownership

using tf::cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter>
#include <taskflow/cuda/cudaflow.hpp>

default smart pointer type to manage a cudaGraphExec_t object with unique ownership

Function documentation

template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>
T tf::next_pow2(T x) constexpr

rounds the given 64-bit unsigned integer to the nearest power of 2

A separate overload rounds the given 32-bit unsigned integer to the nearest power of 2.

template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>
bool tf::is_pow2(const T& x) constexpr

checks if the given number is a power of 2

Template parameters
T	The type of the input. Must be an integral type.
Parameters
x	The integer to check.
Returns	`true` if `x` is a power of 2, otherwise `false`.

This function determines if the given integer is a power of 2.

template<typename T>
size_t tf::floor_log2(T n) constexpr

computes the floor of the base-2 logarithm of a number using count-leading-zeros (CLZ).

Template parameters
T	integer type (uint32_t or uint64_t).
Parameters
n	input number.
Returns	floor of `log2(n)`

This function efficiently calculates the floor of log2(n) for both 32-bit and 64-bit integers.

template<typename RandItr, typename C>
RandItr tf::median_of_three(RandItr l, RandItr m, RandItr r, C cmp)

finds the median of three numbers pointed to by iterators using the given comparator

Template parameters
RandItr	The type of the random-access iterator.
C	The type of the comparator.
Parameters
l	Iterator to the first element.
m	Iterator to the second element.
r	Iterator to the third element.
cmp	The comparator used to compare the dereferenced iterator values.
Returns	The iterator pointing to the median value among the three elements.

This function determines the median value of the elements pointed to by three random-access iterators using the provided comparator.

template<typename RandItr, typename C>
RandItr tf::pseudo_median_of_nine(RandItr beg, RandItr end, C cmp)

finds the pseudo median of a range of items using a spread of nine numbers

Template parameters
RandItr	The type of the random-access iterator.
C	The type of the comparator.
Parameters
beg	Iterator to the beginning of the range.
end	Iterator to the end of the range.
cmp	The comparator used to compare the dereferenced iterator values.
Returns	The iterator pointing to the pseudo median of the range.

This function computes an approximate median of a range of items by sampling nine values spread across the range and finding their median. It uses a combination of the median_of_three function to determine the pseudo median.

template<typename Iter, typename Compare>
void tf::sort2(Iter a, Iter b, Compare comp)

sorts two elements of dereferenced iterators using the given comparison function

Template parameters
Iter	The type of the iterator.
Compare	The type of the comparator.
Parameters
a	Iterator to the first element.
b	Iterator to the second element.
comp	The comparator used to compare the dereferenced iterator values.

This function compares two elements pointed to by iterators and swaps them if they are out of order according to the provided comparator.

template<typename Iter, typename Compare>
void tf::sort3(Iter a, Iter b, Iter c, Compare comp)

Sorts three elements of dereferenced iterators using the given comparison function.

Template parameters
Iter	The type of the iterator.
Compare	The type of the comparator.
Parameters
a	Iterator to the first element.
b	Iterator to the second element.
c	Iterator to the third element.
comp	The comparator used to compare the dereferenced iterator values.

This function sorts three elements pointed to by iterators in ascending order according to the provided comparator. The sorting is performed using a sequence of calls to the sort2 function to ensure the correct order of elements.

template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
T tf::unique_id()

generates a program-wide unique ID of the given type in a thread-safe manner

Template parameters
T	The type of the ID to generate. Must be an integral type.
Returns	A unique ID of type `T`.

This function provides a globally unique identifier of the specified integral type. It uses a static std::atomic counter to ensure thread safety and increments the counter in a relaxed memory ordering for efficiency.

template<typename T>
void tf::atomic_max(std::atomic<T>& v, const T& max_v) noexcept

updates an atomic variable with the maximum value

Template parameters
T	The type of the atomic variable. Must be trivially copyable and comparable.
Parameters
v	The atomic variable to update.
max_v	The value to compare with the current value of `v`.

This function atomically updates the provided atomic variable v to hold the maximum of its current value and max_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.

template<typename T>
void tf::atomic_min(std::atomic<T>& v, const T& min_v) noexcept

updates an atomic variable with the minimum value

Template parameters
T	The type of the atomic variable. Must be trivially copyable and comparable.
Parameters
v	The atomic variable to update.
min_v	The value to compare with the current value of `v`.

This function atomically updates the provided atomic variable v to hold the minimum of its current value and min_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.

template<typename T>
T tf::seed() noexcept

generates a random seed based on the current system clock

Template parameters
T	The type of the returned seed. Must be an integral type.
Returns	A seed value based on the system clock.

This function returns a seed value derived from the number of clock ticks since the epoch as measured by the system clock. The seed can be used to initialize random number generators.

template<typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
auto tf::ctz(T x)

counts the number of trailing zeros in an integer.

Template parameters
T	integer type (32-bit or 64-bit).
Parameters
x	non-zero integer to count trailing zeros from
Returns	the number of trailing zeros in `x`

This function provides a portable implementation for counting the number of trailing zeros across different platforms and integer sizes (32-bit and 64-bit).

size_t tf::coprime(size_t N) constexpr

computes a coprime of a given number

Parameters
N	input number for which a coprime is to be found.
Returns	the largest number < `N` that is coprime to N

This function finds the largest number less than N that is coprime (i.e., has a greatest common divisor of 1) with N. If N is less than 3, it returns 1 as a default coprime.

template<size_t N>
std::array<size_t, N> tf::make_coprime_lut() constexpr

generates a compile-time array of coprimes for numbers from 0 to N-1

Template parameters
N	the size of the array to generate (should be greater than 0).
Returns	a constexpr array of size `N` where each index holds a coprime of its value.

This function constructs a constexpr array where each element at index i contains a coprime of i (the largest number less than i that is coprime to it).

std::string tf::get_env(const std::string& str)

retrieves the value of an environment variable

Parameters
str	The name of the environment variable to retrieve.
Returns	The value of the environment variable as a string, or an empty string if not found.

This function fetches the value of an environment variable by name. If the variable is not found, it returns an empty string.

bool tf::has_env(const std::string& str)

checks whether an environment variable is defined

Parameters
str	The name of the environment variable to check.
Returns	`true` if the environment variable exists, `false` otherwise.

This function determines if a specific environment variable exists in the current environment.

void tf::pause()

This function is used in spin-wait loops to hint to the CPU that the current thread is in a busy-wait state. It helps reduce power consumption and improves performance on hyper-threaded processors by preventing the CPU from consuming unnecessary cycles while waiting. It is particularly useful in low-contention scenarios, where the thread is likely to quickly acquire the lock or condition it's waiting for, avoiding an expensive context switch. On modern x86 processors, the pause instruction can be invoked using __builtin_ia32_pause() in GCC/Clang or _mm_pause() in MSVC. On non-x86 architectures, alternative mechanisms such as yielding the CPU may be used instead.

template<typename P>
void tf::spin_until(P&& predicate)

spins until the given predicate becomes true

Template parameters
P	the type of the predicate function or callable.
Parameters
predicate	the callable that returns a boolean value, which is checked in the loop.

This function repeatedly checks the provided predicate in a spin-wait loop and uses a backoff strategy to minimize CPU waste during the wait. Initially, it uses the pause() instruction for the first 100 iterations to hint to the CPU that the thread is waiting, thus reducing power consumption and avoiding unnecessary cycles. After 100 iterations, it switches to yielding the CPU using std::this_thread::yield() to allow other threads to run and improve system responsiveness.

The function operates as follows:

For the first 100 iterations, it invokes pause() to reduce power consumption during the spin-wait.
After 100 iterations, it uses std::this_thread::yield() to relinquish the CPU, allowing other threads to execute.

template<typename B, typename E, typename S>
std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, bool> tf::is_index_range_invalid(B beg, E end, S step) constexpr

checks if the given index range is invalid

Template parameters
B	type of the beginning index
E	type of the ending index
S	type of the step size
Parameters
beg	starting index of the range
end	ending index of the range
step	step size to traverse the range
Returns	returns `true` if the range is invalid; `false` otherwise.

A range is considered invalid under the following conditions:

The step is zero and the begin and end values are not equal.
A positive range (begin < end) with a non-positive step.
A negative range (begin > end) with a non-negative step.

template<typename B, typename E, typename S>
std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, size_t> tf::distance(B beg, E end, S step) constexpr

calculates the number of iterations in the given index range

Template parameters
B	type of the beginning index
E	type of the ending index
S	type of the step size
Parameters
beg	starting index of the range
end	ending index of the range
step	step size to traverse the range
Returns	returns the number of required iterations to traverse the range

The distance of a range represents the number of required iterations to traverse the range from the beginning index to the ending index (exclusive) with the given step size.

Example 1:

// Range: 0 to 10 with step size 2
size_t dist = distance(0, 10, 2);  // Returns 5, the sequence is [0, 2, 4, 6, 8]

Example 2:

// Range: 10 to 0 with step size -2
size_t dist = distance(10, 0, -2);  // Returns 5, the sequence is [10, 8, 6, 4, 2]

Example 3:

// Range: 5 to 20 with step size 5
size_t dist = distance(5, 20, 5);  // Returns 3, the sequence is [5, 10, 15]

#include <taskflow/core/worker.hpp>

template<typename T, typename... ArgsT>
std::unique_ptr<T> tf::make_worker_interface(ArgsT && ... args)

helper function to create an instance derived from tf::WorkerInterface

Template parameters
T	type derived from tf::WorkerInterface
ArgsT	argument types to construct `T`
Parameters
args	arguments to forward to the constructor of `T`

const char* tf::to_string(TaskType type)
#include <taskflow/core/task.hpp>

convert a task type to a human-readable string

The name of each task type is the lower-case form of its enumerator name.

TaskType::PLACEHOLDER     ->  "placeholder"
TaskType::STATIC          ->  "static"
TaskType::RUNTIME         ->  "runtime"
TaskType::SUBFLOW         ->  "subflow"
TaskType::CONDITION       ->  "condition"
TaskType::MODULE          ->  "module"
TaskType::ASYNC           ->  "async"

std::ostream& tf::operator<<(std::ostream& os, const Task& task)
#include <taskflow/core/task.hpp>

overload of ostream inserter operator for Task

const char* tf::to_string(ObserverType type)
#include <taskflow/core/observer.hpp>

convert an observer type to a human-readable string

template<typename Input, typename Output, typename C>
auto tf::make_data_pipe(PipeType d, C&& callable)

function to construct a data pipe (tf::DataPipe)

Template parameters
Input	input data type
Output	output data type
C	callable type

tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the type of the data pipe, either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, and the second argument is a callable to be invoked by the pipeline scheduler. Input and output data types are specified via template parameters, which will always be decayed by the library to their original form for storage purposes. The callable must take the input data type as its first argument and return a value of the output data type.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input) {
    return std::to_string(input + 100);
  }
);

The callable can additionally take a reference to tf::Pipeflow, which allows you to query the runtime information of a stage task, such as its line number and token number.

tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL, 
  [](int& input, tf::Pipeflow& pf) {
    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
    return std::to_string(input + 100);
  }
);

template<typename T>
auto tf::make_module_task(T&& target)

creates a module task using the given target

Template parameters
T	Type of the target object, which must define the method `tf::Graph& graph()`.
Parameters
target	The target object used to create the module task.
Returns	module task that can be used by Taskflow or asynchronous tasking.

This example demonstrates how to create and launch multiple taskflows in parallel using asynchronous tasking:

tf::Executor executor;

tf::Taskflow A;
tf::Taskflow B;
tf::Taskflow C;
tf::Taskflow D;

A.emplace([](){ printf("Taskflow A\n"); }); 
B.emplace([](){ printf("Taskflow B\n"); }); 
C.emplace([](){ printf("Taskflow C\n"); }); 
D.emplace([](){ printf("Taskflow D\n"); }); 

// launch the four taskflows using asynchronous tasking
executor.async(tf::make_module_task(A));
executor.async(tf::make_module_task(B));
executor.async(tf::make_module_task(C));
executor.async(tf::make_module_task(D));
executor.wait_for_all();

The module task maker, tf::make_module_task, is basically the same as tf::Taskflow::composed_of but provides a more generic interface that can be used beyond Taskflow. For instance, the following two approaches achieve the same functionality.

// approach 1: composition using composed_of
tf::Task m1 = taskflow1.composed_of(taskflow2);

// approach 2: composition using make_module_task
tf::Task m1 = taskflow1.emplace(tf::make_module_task(taskflow2));

size_t tf::cuda_get_num_devices()
#include <taskflow/cuda/cuda_device.hpp>

queries the number of available devices

int tf::cuda_get_device()
#include <taskflow/cuda/cuda_device.hpp>

gets the current device associated with the caller thread

void tf::cuda_set_device(int id)
#include <taskflow/cuda/cuda_device.hpp>

switches to a given device context

void tf::cuda_get_device_property(int i, cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>

obtains the device property

cudaDeviceProp tf::cuda_get_device_property(int i)
#include <taskflow/cuda/cuda_device.hpp>

obtains the device property

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>

dumps the device property

size_t tf::cuda_get_device_max_threads_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum threads per block on a device

size_t tf::cuda_get_device_max_x_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum x-dimension per block on a device

size_t tf::cuda_get_device_max_y_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum y-dimension per block on a device

size_t tf::cuda_get_device_max_z_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum z-dimension per block on a device

size_t tf::cuda_get_device_max_x_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum x-dimension per grid on a device

size_t tf::cuda_get_device_max_y_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum y-dimension per grid on a device

size_t tf::cuda_get_device_max_z_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum z-dimension per grid on a device

size_t tf::cuda_get_device_max_shm_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the maximum shared memory size in bytes per block on a device

size_t tf::cuda_get_device_warp_size(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the warp size on a device

int tf::cuda_get_device_compute_capability_major(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the major number of compute capability of a device

int tf::cuda_get_device_compute_capability_minor(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries the minor number of compute capability of a device

bool tf::cuda_get_device_unified_addressing(int d)
#include <taskflow/cuda/cuda_device.hpp>

queries if the device supports unified addressing

int tf::cuda_get_driver_version()
#include <taskflow/cuda/cuda_device.hpp>

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int tf::cuda_get_runtime_version()
#include <taskflow/cuda/cuda_device.hpp>

queries the CUDA Runtime version (1000 * major + 10 * minor)

size_t tf::cuda_get_free_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>

queries the free memory (expensive call)

size_t tf::cuda_get_total_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>

queries the total available memory (expensive call)

#include <taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

#include <taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function performs the same allocation as tf::cuda_malloc_device(size_t N, int d), using the current device associated with the caller.

#include <taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

#include <taskflow/cuda/cuda_memory.hpp>

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T	pointer type
Parameters
ptr	device pointer to memory to free
d	device context identifier

This function calls cudaFree to free the memory space pointed to by ptr using the given device context.

#include <taskflow/cuda/cuda_memory.hpp>

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T	pointer type
Parameters
ptr	device pointer to memory to free

This function calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
#include <taskflow/cuda/cuda_memory.hpp>

copies data between host and device asynchronously through a stream

Parameters
stream	stream identifier
dst	destination memory address
src	source memory address
count	size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas must not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
#include <taskflow/cuda/cuda_memory.hpp>

initializes or sets GPU memory to the given value byte by byte

Parameters
stream	stream identifier
devPtr	pointer to GPU memory
value	value to set for each byte of the specified memory
count	size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte given by the value parameter.

cudaGraphNodeType tf::cuda_get_graph_node_type(cudaGraphNode_t node)

queries the type of a native CUDA graph node

valid type values are:

cudaGraphNodeTypeKernel = 0x00
cudaGraphNodeTypeMemcpy = 0x01
cudaGraphNodeTypeMemset = 0x02
cudaGraphNodeTypeHost = 0x03
cudaGraphNodeTypeGraph = 0x04
cudaGraphNodeTypeEmpty = 0x05
cudaGraphNodeTypeWaitEvent = 0x06
cudaGraphNodeTypeEventRecord = 0x07

const char* tf::version() constexpr
#include <taskflow/taskflow.hpp>

queries the version information in a string format major.minor.patch

Release notes are available here: https://taskflow.github.io/taskflow/Releases.html

Variable documentation

#include <taskflow/core/graph.hpp>

template<typename P>
bool tf::is_task_params_v constexpr

determines if the given type is a task parameter type

Task parameters can be specified in one of the following types:

tf::TaskParams: assign the struct of defined parameters
tf::DefaultTaskParams: assign nothing
std::string: assign a name to the task

#include <taskflow/core/graph.hpp>

template<typename T>
bool tf::has_graph_v constexpr

determines if the given type has a member function Graph& graph()

Template parameters
T	The type to inspect.

This trait determines if the provided type T contains a member function with the exact signature tf::Graph& graph(). It uses SFINAE and std::void_t to detect the presence of the member function and its return type.

Example usage:

struct A {
  tf::Graph& graph() { return my_graph; };
  tf::Graph my_graph;

  // other custom members to alter my_graph
};

struct C {}; // No graph function

static_assert(has_graph_v<A>, "A has graph()");
static_assert(!has_graph_v<C>, "C does not have graph()");

std::array<TaskType, 7> tf::TASK_TYPES constexpr
#include <taskflow/core/task.hpp>

array of all task types (used for iterating task types)

#include <taskflow/core/task.hpp>

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

#include <taskflow/core/task.hpp>

template<typename C>
bool tf::is_subflow_task_v constexpr

determines if a callable is a subflow task

A subflow task is a callable object constructible from std::function<void(Subflow&)>.

#include <taskflow/core/task.hpp>

template<typename C>
bool tf::is_runtime_task_v constexpr

determines if a callable is a runtime task

A runtime task is a callable object constructible from std::function<void(Runtime&)>.

#include <taskflow/core/task.hpp>

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

#include <taskflow/core/task.hpp>

template<typename C>
bool tf::is_multi_condition_task_v constexpr

determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>.

#include <taskflow/algorithm/partitioner.hpp>

template<typename P>
bool tf::is_partitioner_v constexpr

determines if a type is a partitioner

A partitioner is a type derived from tf::PartitionerBase.

tf namespace

Classes

Enums

Typedefs

Functions

Variables

Enum documentation

enum class tf::TaskType: int #include <taskflow/core/task.hpp>

enum class tf::ObserverType: int #include <taskflow/core/observer.hpp>

enum class tf::PartitionerType: int #include <taskflow/algorithm/partitioner.hpp>

enum class tf::PipeType: int #include <taskflow/algorithm/pipeline.hpp>

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock> #include <taskflow/core/observer.hpp>

using tf::DefaultPartitioner = GuidedPartitioner<> #include <taskflow/algorithm/partitioner.hpp>

using tf::cudaEvent = cudaEventBase<cudaEventCreator, cudaEventDeleter> #include <taskflow/cuda/cuda_stream.hpp>

using tf::cudaStream = cudaStreamBase<cudaStreamCreator, cudaStreamDeleter> #include <taskflow/cuda/cuda_stream.hpp>

using tf::cudaGraph = cudaGraphBase<cudaGraphCreator, cudaGraphDeleter> #include <taskflow/cuda/cudaflow.hpp>

using tf::cudaGraphExec = cudaGraphExecBase<cudaGraphExecCreator, cudaGraphExecDeleter> #include <taskflow/cuda/cudaflow.hpp>

Function documentation

template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr> T tf::next_pow2(T x) constexpr

template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr> bool tf::is_pow2(const T& x) constexpr

template<typename T> size_t tf::floor_log2(T n) constexpr

template<typename RandItr, typename C> RandItr tf::median_of_three(RandItr l, RandItr m, RandItr r, C cmp)

template<typename RandItr, typename C> RandItr tf::pseudo_median_of_nine(RandItr beg, RandItr end, C cmp)

template<typename Iter, typename Compare> void tf::sort2(Iter a, Iter b, Compare comp)

template<typename Iter, typename Compare> void tf::sort3(Iter a, Iter b, Iter c, Compare comp)

template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr> T tf::unique_id()

template<typename T> void tf::atomic_max(std::atomic<T>& v, const T& max_v) noexcept

template<typename T> void tf::atomic_min(std::atomic<T>& v, const T& min_v) noexcept

template<typename T> T tf::seed() noexcept

template<typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> auto tf::ctz(T x)

size_t tf::coprime(size_t N) constexpr

template<size_t N> std::array<size_t, N> tf::make_coprime_lut() constexpr

std::string tf::get_env(const std::string& str)

bool tf::has_env(const std::string& str)

void tf::pause()

template<typename P> void tf::spin_until(P&& predicate)

template<typename B, typename E, typename S> std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, bool> tf::is_index_range_invalid(B beg, E end, S step) constexpr

template<typename B, typename E, typename S> std::enable_if_t<std::is_integral_v<std::decay_t<B>> && std::is_integral_v<std::decay_t<E>> && std::is_integral_v<std::decay_t<S>>, size_t> tf::distance(B beg, E end, S step) constexpr

#include <taskflow/core/worker.hpp> template<typename T, typename... ArgsT> std::unique_ptr<T> tf::make_worker_interface(ArgsT && ... args)

const char* tf::to_string(TaskType type) #include <taskflow/core/task.hpp>

std::ostream& tf::operator<<(std::ostream& os, const Task& task) #include <taskflow/core/task.hpp>

const char* tf::to_string(ObserverType type) #include <taskflow/core/observer.hpp>

template<typename Input, typename Output, typename C> auto tf::make_data_pipe(PipeType d, C&& callable)

template<typename T> auto tf::make_module_task(T&& target)

size_t tf::cuda_get_num_devices() #include <taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device() #include <taskflow/cuda/cuda_device.hpp>

void tf::cuda_set_device(int id) #include <taskflow/cuda/cuda_device.hpp>

void tf::cuda_get_device_property(int i, cudaDeviceProp& p) #include <taskflow/cuda/cuda_device.hpp>

cudaDeviceProp tf::cuda_get_device_property(int i) #include <taskflow/cuda/cuda_device.hpp>

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_threads_per_block(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_x_dim_per_block(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_y_dim_per_block(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_z_dim_per_block(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_x_dim_per_grid(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_y_dim_per_grid(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_z_dim_per_grid(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_shm_per_block(int d) #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_warp_size(int d) #include <taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device_compute_capability_major(int d) #include <taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device_compute_capability_minor(int d) #include <taskflow/cuda/cuda_device.hpp>

bool tf::cuda_get_device_unified_addressing(int d) #include <taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_driver_version() #include <taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_runtime_version() #include <taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_free_mem(int d) #include <taskflow/cuda/cuda_memory.hpp>

size_t tf::cuda_get_total_mem(int d) #include <taskflow/cuda/cuda_memory.hpp>

#include <taskflow/cuda/cuda_memory.hpp> template<typename T> T* tf::cuda_malloc_device(size_t N, int d)

#include <taskflow/cuda/cuda_memory.hpp> template<typename T> T* tf::cuda_malloc_device(size_t N)

#include <taskflow/cuda/cuda_memory.hpp> template<typename T> T* tf::cuda_malloc_shared(size_t N)

#include <taskflow/cuda/cuda_memory.hpp> template<typename T> void tf::cuda_free(T* ptr, int d)

#include <taskflow/cuda/cuda_memory.hpp> template<typename T> void tf::cuda_free(T* ptr)

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count) #include <taskflow/cuda/cuda_memory.hpp>

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count) #include <taskflow/cuda/cuda_memory.hpp>