tf namespace
taskflow namespace
Classes
- class AsyncTask
- class to create a dependent asynchronous task (async task)
-
template<typename T, size_t LogSize = TF_DEFAULT_BOUNDED_TASK_QUEUE_LOG_SIZE>class BoundedTaskQueue
- class to create a lock-free bounded single-producer multiple-consumer queue
-
template<typename T>struct CachelineAligned
- struct to ensure cacheline-aligned storage for an object.
- class ChromeObserver
- class to create an observer based on Chrome tracing format
-
template<typename T>class cudaDeviceAllocator
- class to create a CUDA device allocator
- class cudaEvent
- class to create an RAII-styled wrapper over a native CUDA event
-
template<unsigned NT, unsigned VT>class cudaExecutionPolicy
- class to define execution policy for CUDA standard algorithms
- class cudaFlow
- class to create a cudaFlow task dependency graph
- class cudaFlowCapturer
- class to create a cudaFlow graph using stream capture
- class cudaFlowLinearOptimizer
- class to capture a linear CUDA graph using a sequential stream
- class cudaFlowRoundRobinOptimizer
- class to capture a CUDA graph using a round-robin algorithm
- class cudaFlowSequentialOptimizer
- class to capture a CUDA graph using a sequential stream
- class cudaScopedDevice
- class to create an RAII-styled context switch
- class cudaStream
- class to create an RAII-styled wrapper over a native CUDA stream
- class cudaTask
- class to create a task handle over an internal node of a cudaFlow graph
-
template<typename T>class cudaUSMAllocator
- class to create a unified shared memory (USM) allocator
-
template<typename Input, typename Output, typename C>class DataPipe
- class to create a stage in a data-parallel pipeline
-
template<typename... Ps>class DataPipeline
- class to create a data-parallel pipeline scheduling framework
- struct DefaultClosureWrapper
- default closure wrapper that simply runs the given closure as is
- struct DefaultTaskParams
- empty task parameter type for compile-time optimization
-
template<typename C = DefaultClosureWrapper>class DynamicPartitioner
- class to construct a dynamic partitioner for scheduling parallel algorithms
- class Executor
- class to create an executor
- class FlowBuilder
- class to build a task dependency graph
-
template<typename T>class Future
- class to access the result of an execution
- class Graph
- class to create a graph object
-
template<typename C = DefaultClosureWrapper>class GuidedPartitioner
- class to construct a guided partitioner for scheduling parallel algorithms
- class ObserverInterface
- class to derive an executor observer
-
template<typename C = DefaultClosureWrapper>class PartitionerBase
- class to derive a partitioner for scheduling parallel algorithms
-
template<typename C = std::function<void(tf::Pipeflow&)>>class Pipe
- class to create a pipe object for a pipeline stage
- class Pipeflow
- class to create a pipeflow object used by the pipe callable
-
template<typename... Ps>class Pipeline
- class to create a pipeline scheduling framework
-
template<typename C = DefaultClosureWrapper>class RandomPartitioner
- class to construct a random partitioner for scheduling parallel algorithms
- class Runtime
- class to include a runtime object in a task
-
template<typename P>class ScalablePipeline
- class to create a scalable pipeline object
- class Semaphore
- class to create a semaphore object for building a concurrency constraint
-
template<typename T, unsigned N = 2>class SmallVector
- class to define a vector optimized for small array
-
template<typename C = DefaultClosureWrapper>class StaticPartitioner
- class to construct a static partitioner for scheduling parallel algorithms
- class Subflow
- class to construct a subflow graph from the execution of a dynamic task
- class Task
- class to create a task handle over a node in a taskflow graph
- class Taskflow
- class to create a taskflow object
- struct TaskParams
- task parameters to use when creating an asynchronous task
- class TaskView
- class to access task information from the observer interface
- class TFProfObserver
- class to create an observer based on the built-in taskflow profiler format
-
template<typename T>class UnboundedTaskQueue
- class to create a lock-free unbounded single-producer multiple-consumer queue
- class Worker
- class to create a worker in an executor
- class WorkerView
- class to create an immutable view of a worker in an executor
Enums
- enum class TaskType: int { PLACEHOLDER = 0, STATIC, SUBFLOW, CONDITION, MODULE, ASYNC, UNDEFINED }
- enumeration of all task types
- enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
- enumeration of all observer types
- enum class PartitionerType: int { STATIC, DYNAMIC }
- enumeration of all partitioner types
- enum class PipeType: int { PARALLEL = 1, SERIAL = 2 }
- enumeration of all pipe types
- enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
- enumeration of all cudaTask types
Typedefs
-
using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
- default time point type of observers
- using DefaultPartitioner = GuidedPartitioner<>
- default partitioner set to tf::GuidedPartitioner
- using cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
- default execution policy
Functions
-
template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>auto next_pow2(T x) -> T constexpr
- rounds the given 64-bit unsigned integer to the nearest power of 2
-
template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>auto is_pow2(const T& x) -> bool constexpr
- checks if the given number is a power of 2
-
template<typename T>auto log2(T n) -> int constexpr
- Computes the floor of log2 of the given positive integer.
-
template<typename RandItr, typename C>auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr
- finds the median of three numbers pointed to by iterators using the given comparator
-
template<typename RandItr, typename C>auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr
- finds the pseudo median of a range of items using a spread of nine numbers
-
template<typename Iter, typename Compare>void sort2(Iter a, Iter b, Compare comp)
- sorts two elements of dereferenced iterators using the given comparison function
-
template<typename Iter, typename Compare>void sort3(Iter a, Iter b, Iter c, Compare comp)
- Sorts three elements of dereferenced iterators using the given comparison function.
-
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>auto unique_id() -> T
- generates a program-wide unique ID of the given type in a thread-safe manner
-
template<typename T>void atomic_max(std::atomic<T>& v, const T& max_v) noexcept
- updates an atomic variable with the maximum value
-
template<typename T>void atomic_min(std::atomic<T>& v, const T& min_v) noexcept
- updates an atomic variable with the minimum value
-
template<typename T>auto seed() -> T noexcept
- generates a random seed based on the current system clock
-
auto get_env(const std::string& str) -> std::string
- retrieves the value of an environment variable
-
auto has_env(const std::string& str) -> bool
- checks whether an environment variable is defined
- void pause()
- hints to the CPU that the calling thread is in a spin-wait loop
-
template<typename P>void spin_until(P&& predicate)
- spins until the given predicate becomes true
- auto to_string(TaskType type) -> const char*
- convert a task type to a human-readable string
-
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
- overload of ostream inserter operator for Task
-
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>auto try_acquire(I first, I last) -> bool
- tries to acquire all semaphores in the specified range
-
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>auto try_acquire(S && ... semaphores) -> bool
- tries to acquire all semaphores
-
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>void release(I first, I last)
- releases all semaphores in the specified range
-
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>void release(S && ... semaphores)
- releases all semaphores
- auto to_string(ObserverType type) -> const char*
- convert an observer type to a human-readable string
-
template<typename Input, typename Output, typename C>auto make_data_pipe(PipeType d, C&& callable) -> auto
- function to construct a data pipe (tf::DataPipe)
- auto cuda_get_num_devices() -> size_t
- queries the number of available devices
- auto cuda_get_device() -> int
- gets the current device associated with the caller thread
- void cuda_set_device(int id)
- switches to a given device context
- void cuda_get_device_property(int i, cudaDeviceProp& p)
- obtains the device property
- auto cuda_get_device_property(int i) -> cudaDeviceProp
- obtains the device property
-
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
- dumps the device property
- auto cuda_get_device_max_threads_per_block(int d) -> size_t
- queries the maximum threads per block on a device
- auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
- queries the maximum x-dimension per block on a device
- auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
- queries the maximum y-dimension per block on a device
- auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
- queries the maximum z-dimension per block on a device
- auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
- queries the maximum x-dimension per grid on a device
- auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
- queries the maximum y-dimension per grid on a device
- auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
- queries the maximum z-dimension per grid on a device
- auto cuda_get_device_max_shm_per_block(int d) -> size_t
- queries the maximum shared memory size in bytes per block on a device
- auto cuda_get_device_warp_size(int d) -> size_t
- queries the warp size on a device
- auto cuda_get_device_compute_capability_major(int d) -> int
- queries the major number of compute capability of a device
- auto cuda_get_device_compute_capability_minor(int d) -> int
- queries the minor number of compute capability of a device
- auto cuda_get_device_unified_addressing(int d) -> bool
- queries if the device supports unified addressing
- auto cuda_get_driver_version() -> int
- queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
- auto cuda_get_runtime_version() -> int
- queries the CUDA Runtime version (1000 * major + 10 * minor)
- auto cuda_get_free_mem(int d) -> size_t
- queries the free memory (expensive call)
- auto cuda_get_total_mem(int d) -> size_t
- queries the total available memory (expensive call)
-
template<typename T>auto cuda_malloc_device(size_t N, int d) -> T*
- allocates memory on the given device for holding N elements of type T
-
template<typename T>auto cuda_malloc_device(size_t N) -> T*
- allocates memory on the current device associated with the caller
-
template<typename T>auto cuda_malloc_shared(size_t N) -> T*
- allocates shared memory for holding N elements of type T
-
template<typename T>void cuda_free(T* ptr, int d)
- frees memory on the GPU device
-
template<typename T>void cuda_free(T* ptr)
- frees memory on the GPU device
- void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
- copies data between host and device asynchronously through a stream
- void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
- initializes or sets GPU memory to the given value byte by byte
- auto to_string(cudaTaskType type) -> const char* constexpr
- convert a cuda_task type to a human-readable string
-
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
- overload of ostream inserter operator for cudaTask
-
template<typename P, typename C>void cuda_single_task(P&& p, C c)
- runs a callable asynchronously using one kernel thread
-
template<typename P, typename I, typename C>void cuda_for_each(P&& p, I first, I last, C c)
- performs asynchronous parallel iterations over a range of items
-
template<typename P, typename I, typename C>void cuda_for_each_index(P&& p, I first, I last, I inc, C c)
- performs asynchronous parallel iterations over an index-based range of items
-
template<typename P, typename I, typename O, typename C>void cuda_transform(P&& p, I first, I last, O output, C op)
- performs asynchronous parallel transforms over a range of items
-
template<typename P, typename I1, typename I2, typename O, typename C>void cuda_transform(P&& p, I1 first1, I1 last1, I2 first2, O output, C op)
- performs asynchronous parallel transforms over two ranges of items
-
template<typename P, typename I, typename T, typename O>void cuda_reduce(P&& p, I first, I last, T* res, O op, void* buf)
- performs asynchronous parallel reduction over a range of items
-
template<typename P, typename I, typename T, typename O>void cuda_uninitialized_reduce(P&& p, I first, I last, T* res, O op, void* buf)
- performs asynchronous parallel reduction over a range of items without an initial value
-
template<typename P, typename I, typename T, typename O, typename U>void cuda_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
- performs asynchronous parallel reduction over a range of transformed items with an initial value
-
template<typename P, typename I, typename T, typename O, typename U>void cuda_uninitialized_transform_reduce(P&& p, I first, I last, T* res, O bop, U uop, void* buf)
- performs asynchronous parallel reduction over a range of transformed items without an initial value
-
template<typename P, typename I, typename O, typename C>void cuda_inclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
- performs asynchronous inclusive scan over a range of items
-
template<typename P, typename I, typename O, typename C, typename U>void cuda_transform_inclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
- performs asynchronous inclusive scan over a range of transformed items
-
template<typename P, typename I, typename O, typename C>void cuda_exclusive_scan(P&& p, I first, I last, O output, C op, void* buf)
- performs asynchronous exclusive scan over a range of items
-
template<typename P, typename I, typename O, typename C, typename U>void cuda_transform_exclusive_scan(P&& p, I first, I last, O output, C bop, U uop, void* buf)
- performs asynchronous exclusive scan over a range of transformed items
-
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>void cuda_merge_by_key(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void* buf)
- performs asynchronous key-value merge over a range of keys and values
-
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>void cuda_merge(P&& p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void* buf)
- performs asynchronous key-only merge over a range of keys
-
template<typename P, typename K, typename V = cudaEmpty>auto cuda_sort_buffer_size(unsigned count) -> unsigned
- queries the buffer size in bytes needed to call sort kernels for the given number of elements
-
template<typename P, typename K_it, typename V_it, typename C>void cuda_sort_by_key(P&& p, K_it k_first, K_it k_last, V_it v_first, C comp, void* buf)
- performs asynchronous key-value sort on a range of items
-
template<typename P, typename K_it, typename C>void cuda_sort(P&& p, K_it k_first, K_it k_last, C comp, void* buf)
- performs asynchronous key-only sort on a range of items
-
template<typename P, typename I, typename U>void cuda_find_if(P&& p, I first, I last, unsigned* idx, U op)
- finds the index of the first element that satisfies the given criteria
-
template<typename P, typename I, typename O>void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
- finds the index of the minimum element in a range
-
template<typename P, typename I, typename O>void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf)
- finds the index of the maximum element in a range
- auto version() -> const char* constexpr
- queries the version information in a string format major.minor.patch
Variables
-
template<typename P>bool is_task_params_v constexpr
- determines if the given type is a task parameter type
-
std::array<TaskType, 6> TASK_TYPES constexpr
- array of all task types (used for iterating task types)
-
template<typename C>bool is_subflow_task_v constexpr
- determines if a callable is a dynamic task
-
template<typename C>bool is_condition_task_v constexpr
- determines if a callable is a condition task
-
template<typename C>bool is_multi_condition_task_v constexpr
- determines if a callable is a multi-condition task
-
template<typename C>bool is_static_task_v constexpr
- determines if a callable is a static task
-
template<typename P>bool is_partitioner_v constexpr
- determines if a type is a partitioner
Enum documentation
enum class tf:: TaskType: int
#include <taskflow/core/task.hpp>
enumeration of all task types
Enumerators | |
---|---|
PLACEHOLDER | placeholder task type |
STATIC | static task type |
SUBFLOW | dynamic (subflow) task type |
CONDITION | condition task type |
MODULE | module task type |
ASYNC | asynchronous task type |
UNDEFINED | undefined task type (for internal use only) |
enum class tf:: ObserverType: int
#include <taskflow/core/observer.hpp>
enumeration of all observer types
enum class tf:: PartitionerType: int
#include <taskflow/algorithm/partitioner.hpp>
enumeration of all partitioner types
Enumerators | |
---|---|
STATIC | static partitioner type |
DYNAMIC | dynamic partitioner type |
enum class tf:: PipeType: int
#include <taskflow/algorithm/pipeline.hpp>
enumeration of all pipe types
Enumerators | |
---|---|
PARALLEL | parallel type |
SERIAL | serial type |
enum class tf:: cudaTaskType: int
#include <taskflow/cuda/cuda_task.hpp>
enumeration of all cudaTask types
Enumerators | |
---|---|
EMPTY | empty task type |
HOST | host task type |
MEMSET | memory set task type |
MEMCPY | memory copy task type |
KERNEL | kernel task type |
SUBFLOW | subflow (child graph) task type |
CAPTURE | capture task type |
UNDEFINED | undefined task type |
Typedef documentation
using tf:: observer_stamp_t = std:: chrono:: time_point<std:: chrono:: steady_clock>
#include <taskflow/core/observer.hpp>
default time point type of observers
using tf:: DefaultPartitioner = GuidedPartitioner<>
#include <taskflow/algorithm/partitioner.hpp>
default partitioner set to tf::GuidedPartitioner
The guided partitioning algorithm can achieve stable and decent performance for most parallel algorithms.
using tf:: cudaDefaultExecutionPolicy = cudaExecutionPolicy<512, 7>
default execution policy
Function documentation
template<typename T, std::enable_if_t<(std::is_unsigned_v<std::decay_t<T>> && sizeof(T)==8), void>* = nullptr>
T tf:: next_pow2(T x) constexpr
rounds the given 64-bit unsigned integer to the nearest power of 2
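A minimal sketch of the bit-smearing technique such a function typically uses (illustrative only; next_pow2_sketch is a hypothetical name, not the library's code):
#include <cstdint>
// Smear the highest set bit downwards, then add 1 to get the next power of 2.
constexpr uint64_t next_pow2_sketch(uint64_t x) {
  if(x <= 1) return 1;
  --x;                        // keeps values that are already powers of 2 unchanged
  x |= x >> 1;  x |= x >> 2;
  x |= x >> 4;  x |= x >> 8;
  x |= x >> 16; x |= x >> 32;
  return x + 1;
}
static_assert(next_pow2_sketch(3) == 4 && next_pow2_sketch(8) == 8);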
template<typename T, std::enable_if_t<std::is_integral_v<std::decay_t<T>>, void>* = nullptr>
bool tf:: is_pow2(const T& x) constexpr
checks if the given number is a power of 2
Template parameters | |
---|---|
T | The type of the input. Must be an integral type. |
Parameters | |
x | The integer to check. |
Returns | true if x is a power of 2, otherwise false . |
This function determines if the given integer is a power of 2.
template<typename T>
int tf:: log2(T n) constexpr
Computes the floor of log2 of the given positive integer.
Template parameters | |
---|---|
T | The type of the input. Must be an integral type. |
Parameters | |
n | The positive integer to compute log2 for. Assumes n > 0 . |
Returns | The floor of log2 of n . |
This function calculates the largest integer log such that 2^log <= n.
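An illustrative equivalent under the stated assumption n > 0 (log2_sketch is a hypothetical name, not the library's code):
template <typename T>
constexpr int log2_sketch(T n) {
  int log = 0;
  while(n >>= 1) {   // halve n until it reaches 1
    ++log;
  }
  return log;        // largest log such that 2^log <= n
}
static_assert(log2_sketch(1) == 0 && log2_sketch(9) == 3);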
template<typename RandItr, typename C>
RandItr tf:: median_of_three(RandItr l,
RandItr m,
RandItr r,
C cmp)
finds the median of three numbers pointed to by iterators using the given comparator
Template parameters | |
---|---|
RandItr | The type of the random-access iterator. |
C | The type of the comparator. |
Parameters | |
l | Iterator to the first element. |
m | Iterator to the second element. |
r | Iterator to the third element. |
cmp | The comparator used to compare the dereferenced iterator values. |
Returns | The iterator pointing to the median value among the three elements. |
This function determines the median value of the elements pointed to by three random-access iterators using the provided comparator.
template<typename RandItr, typename C>
RandItr tf:: pseudo_median_of_nine(RandItr beg,
RandItr end,
C cmp)
finds the pseudo median of a range of items using a spread of nine numbers
Template parameters | |
---|---|
RandItr | The type of the random-access iterator. |
C | The type of the comparator. |
Parameters | |
beg | Iterator to the beginning of the range. |
end | Iterator to the end of the range. |
cmp | The comparator used to compare the dereferenced iterator values. |
Returns | The iterator pointing to the pseudo median of the range. |
This function computes an approximate median of a range of items by sampling nine values spread across the range and finding their median. It uses a combination of the median_of_three function to determine the pseudo median.
template<typename Iter, typename Compare>
void tf:: sort2(Iter a,
Iter b,
Compare comp)
sorts two elements of dereferenced iterators using the given comparison function
Template parameters | |
---|---|
Iter | The type of the iterator. |
Compare | The type of the comparator. |
Parameters | |
a | Iterator to the first element. |
b | Iterator to the second element. |
comp | The comparator used to compare the dereferenced iterator values. |
This function compares two elements pointed to by iterators and swaps them if they are out of order according to the provided comparator.
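An illustrative equivalent of this compare-and-swap step (sort2_sketch is a hypothetical name, not the library's code):
#include <algorithm>  // std::iter_swap
template <typename Iter, typename Compare>
void sort2_sketch(Iter a, Iter b, Compare comp) {
  if(comp(*b, *a)) {       // *b should come before *a
    std::iter_swap(a, b);  // swap the pointed-to values
  }
}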
template<typename Iter, typename Compare>
void tf:: sort3(Iter a,
Iter b,
Iter c,
Compare comp)
Sorts three elements of dereferenced iterators using the given comparison function.
Template parameters | |
---|---|
Iter | The type of the iterator. |
Compare | The type of the comparator. |
Parameters | |
a | Iterator to the first element. |
b | Iterator to the second element. |
c | Iterator to the third element. |
comp | The comparator used to compare the dereferenced iterator values. |
This function sorts three elements pointed to by iterators in ascending order according to the provided comparator. The sorting is performed using a sequence of calls to the sort2 function to ensure the correct order of elements.
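An illustrative three-element sorting network built on the sort2_sketch shown above (hypothetical, not the library's code):
template <typename Iter, typename Compare>
void sort3_sketch(Iter a, Iter b, Iter c, Compare comp) {
  sort2_sketch(a, b, comp);  // order the first pair
  sort2_sketch(b, c, comp);  // *c now holds the maximum
  sort2_sketch(a, b, comp);  // re-order the first pair
}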
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
T tf:: unique_id()
generates a program-wide unique ID of the given type in a thread-safe manner
Template parameters | |
---|---|
T | The type of the ID to generate. Must be an integral type. |
Returns | A unique ID of type T . |
This function provides a globally unique identifier of the specified integral type. It uses a static std::atomic counter to ensure thread safety and increments the counter with relaxed memory ordering for efficiency.
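A minimal sketch of such a counter-based generator (unique_id_sketch is an illustrative name, not the library's code):
#include <atomic>
template <typename T>
T unique_id_sketch() {
  static std::atomic<T> counter{0};                        // one counter per integral type T
  return counter.fetch_add(1, std::memory_order_relaxed);  // thread-safe increment
}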
template<typename T>
void tf:: atomic_max(std:: atomic<T>& v,
const T& max_v) noexcept
updates an atomic variable with the maximum value
Template parameters | |
---|---|
T | The type of the atomic variable. Must be trivially copyable and comparable. |
Parameters | |
v | The atomic variable to update. |
max_v | The value to compare with the current value of v . |
This function atomically updates the provided atomic variable v to hold the maximum of its current value and max_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.
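A minimal sketch of the compare-and-swap loop such an update typically uses (atomic_max_sketch is a hypothetical name, not the library's code); tf::atomic_min follows the same pattern with the comparison reversed:
#include <atomic>
template <typename T>
void atomic_max_sketch(std::atomic<T>& v, const T& max_v) noexcept {
  T cur = v.load(std::memory_order_relaxed);
  // stop once v already holds a value that is not smaller than max_v
  while(cur < max_v &&
        !v.compare_exchange_weak(cur, max_v, std::memory_order_relaxed)) {
    // on failure, cur is reloaded with the latest value of v; retry
  }
}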
template<typename T>
void tf:: atomic_min(std:: atomic<T>& v,
const T& min_v) noexcept
updates an atomic variable with the minimum value
Template parameters | |
---|---|
T | The type of the atomic variable. Must be trivially copyable and comparable. |
Parameters | |
v | The atomic variable to update. |
min_v | The value to compare with the current value of v . |
This function atomically updates the provided atomic variable v to hold the minimum of its current value and min_v. The update is performed using a relaxed memory ordering for efficiency in non-synchronizing contexts.
template<typename T>
T tf:: seed() noexcept
generates a random seed based on the current system clock
Template parameters | |
---|---|
T | The type of the returned seed. Must be an integral type. |
Returns | A seed value based on the system clock. |
This function returns a seed value derived from the number of clock ticks since the epoch as measured by the system clock. The seed can be used to initialize random number generators.
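A minimal sketch of a clock-based seed (seed_sketch is an illustrative name, not the library's code):
#include <chrono>
template <typename T>
T seed_sketch() noexcept {
  // number of clock ticks since the epoch, truncated to the requested integral type
  return static_cast<T>(
    std::chrono::system_clock::now().time_since_epoch().count()
  );
}
// e.g., std::mt19937 rng(seed_sketch<std::mt19937::result_type>());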
std:: string tf:: get_env(const std:: string& str)
retrieves the value of an environment variable
Parameters | |
---|---|
str | The name of the environment variable to retrieve. |
Returns | The value of the environment variable as a string, or an empty string if not found. |
This function fetches the value of an environment variable by name. If the variable is not found, it returns an empty string.
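An illustrative equivalent based on std::getenv (get_env_sketch is a hypothetical name; the actual implementation may differ, for example on Windows):
#include <cstdlib>
#include <string>
std::string get_env_sketch(const std::string& str) {
  const char* p = std::getenv(str.c_str());
  return p ? std::string(p) : std::string();  // empty string if the variable is unset
}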
bool tf:: has_env(const std:: string& str)
checks whether an environment variable is defined
Parameters | |
---|---|
str | The name of the environment variable to check. |
Returns | true if the environment variable exists, false otherwise. |
This function determines if a specific environment variable exists in the current environment.
void tf:: pause()
This function is used in spin-wait loops to hint the CPU that the current thread is in a busy-wait state. It helps reduce power consumption and improves performance on hyper-threaded processors by preventing the CPU from consuming unnecessary cycles while waiting. It is particularly useful in low-contention scenarios, where the thread is likely to quickly acquire the lock or condition it's waiting for, avoiding an expensive context switch. On modern x86 processors, this instruction can be invoked using __builtin_ia32_pause()
in GCC/Clang or _mm_pause()
in MSVC. In non-x86 architectures, alternative mechanisms such as yielding the CPU may be used instead.
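An illustrative sketch of the per-architecture dispatch described above (pause_sketch is a hypothetical name; the exact macros and fallbacks may differ):
#include <thread>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
inline void pause_sketch() {
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  __builtin_ia32_pause();        // GCC/Clang on x86
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
  _mm_pause();                   // MSVC on x86
#else
  std::this_thread::yield();     // non-x86 fallback: yield the CPU
#endif
}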
template<typename P>
void tf:: spin_until(P&& predicate)
spins until the given predicate becomes true
Template parameters | |
---|---|
P | the type of the predicate function or callable. |
Parameters | |
predicate | the callable that returns a boolean value, which is checked in the loop. |
This function repeatedly checks the provided predicate in a spin-wait loop and uses a backoff strategy to minimize CPU waste during the wait. Initially, it uses the pause() instruction for the first 100 iterations to hint to the CPU that the thread is waiting, thus reducing power consumption and avoiding unnecessary cycles. After 100 iterations, it switches to yielding the CPU using std::this_thread::yield() to allow other threads to run and improve system responsiveness.
The function operates as follows:
- For the first 100 iterations, it invokes pause() to reduce power consumption during the spin-wait.
- After 100 iterations, it uses std::this_thread::yield() to relinquish the CPU, allowing other threads to execute.
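A minimal sketch of the backoff loop described above, reusing the pause_sketch shown earlier (hypothetical, not the library's code):
#include <thread>
template <typename P>
void spin_until_sketch(P&& predicate) {
  size_t num_spins = 0;
  while(!predicate()) {
    if(num_spins++ < 100) {
      pause_sketch();             // cheap CPU hint for the first 100 iterations
    }
    else {
      std::this_thread::yield();  // afterwards, let other threads run
    }
  }
}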
const char* tf:: to_string(TaskType type)
#include <taskflow/core/task.hpp>
convert a task type to a human-readable string
The name of each task type is the lower-case string of its characters.
TaskType::PLACEHOLDER -> "placeholder"
TaskType::STATIC -> "static"
TaskType::SUBFLOW -> "subflow"
TaskType::CONDITION -> "condition"
TaskType::MODULE -> "module"
TaskType::ASYNC -> "async"
std:: ostream& tf:: operator<<(std:: ostream& os,
const Task& task)
#include <taskflow/core/task.hpp>
overload of ostream inserter operator for Task
#include <taskflow/core/semaphore.hpp>
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
bool tf:: try_acquire(I first,
I last)
tries to acquire all semaphores in the specified range
Template parameters | |
---|---|
I | iterator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
Returns | true if all semaphores are acquired, otherwise false |
Tries to acquire all semaphores in the specified range.
#include <taskflow/core/semaphore.hpp>
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
bool tf:: try_acquire(S && ... semaphores)
tries to acquire all semaphores
Parameters | |
---|---|
semaphores | semaphores to acquire |
Returns | true if all semaphores are acquired, otherwise false |
Tries to acquire all the semaphores.
#include <taskflow/core/semaphore.hpp>
template<typename I, std::enable_if_t<std::is_same_v<deref_t<I>, Semaphore>, void>* = nullptr>
void tf:: release(I first,
I last)
releases all semaphores in the specified range
Template parameters | |
---|---|
I | iterator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
Releases all the semaphores in the given range.
#include <taskflow/core/semaphore.hpp>
template<typename... S, std::enable_if_t<all_same_v<Semaphore, std::decay_t<S>...>, void>* = nullptr>
void tf:: release(S && ... semaphores)
releases all semaphores
Parameters | |
---|---|
semaphores | semaphores to release |
Releases all the semaphores.
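A hedged usage sketch of the variadic acquire/release overloads documented above (assumes tf::Semaphore is constructible from its maximum concurrency, here 1):
tf::Semaphore a(1), b(1);      // each semaphore admits one holder at a time
if(tf::try_acquire(a, b)) {    // acquire both semaphores or neither
  // ... critical section guarded by both semaphores ...
  tf::release(a, b);           // release both semaphores
}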
const char* tf:: to_string(ObserverType type)
#include <taskflow/core/observer.hpp>
convert an observer type to a human-readable string
template<typename Input, typename Output, typename C>
auto tf:: make_data_pipe(PipeType d,
C&& callable)
function to construct a data pipe (tf::DataPipe)
Template parameters | |
---|---|
Input | input data type |
Output | output data type |
C | callable type |
tf::make_data_pipe creates a data pipe (tf::DataPipe) to be used in a data-parallel pipeline (tf::DataPipeline). The first argument specifies the pipe type (serial or parallel) and the second argument is the callable that transforms the input data into the output data. For example:
tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL,
  [](int& input) {
    return std::to_string(input + 100);
  }
);
The callable can additionally take a reference to tf::Pipeflow in its second argument, which allows you to query runtime information of the stage, such as the token and line numbers:
tf::make_data_pipe<int, std::string>(
  tf::PipeType::SERIAL,
  [](int& input, tf::Pipeflow& pf) {
    printf("token=%lu, line=%lu\n", pf.token(), pf.line());
    return std::to_string(input + 100);
  }
);
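A hedged end-to-end sketch of plugging such data pipes into a tf::DataPipeline (names such as num_lines are illustrative; consult the DataPipeline documentation for the exact API):
tf::Executor executor;
tf::Taskflow taskflow;
const size_t num_lines = 4;
tf::DataPipeline pl(num_lines,
  tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) -> int {
    if(pf.token() == 5) {
      pf.stop();        // stop the pipeline after five tokens
      return 0;         // return value is ignored once stopped
    }
    return static_cast<int>(pf.token());
  }),
  tf::make_data_pipe<int, std::string>(tf::PipeType::SERIAL, [](int& input) {
    return std::to_string(input + 100);
  }),
  tf::make_data_pipe<std::string, void>(tf::PipeType::SERIAL, [](std::string& s) {
    printf("%s\n", s.c_str());
  })
);
taskflow.composed_of(pl).name("data pipeline");
executor.run(taskflow).wait();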
size_t tf:: cuda_get_num_devices()
#include <taskflow/cuda/cuda_device.hpp>
queries the number of available devices
int tf:: cuda_get_device()
#include <taskflow/cuda/cuda_device.hpp>
gets the current device associated with the caller thread
void tf:: cuda_set_device(int id)
#include <taskflow/cuda/cuda_device.hpp>
switches to a given device context
void tf:: cuda_get_device_property(int i,
cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>
obtains the device property
cudaDeviceProp tf:: cuda_get_device_property(int i)
#include <taskflow/cuda/cuda_device.hpp>
obtains the device property
void tf:: cuda_dump_device_property(std:: ostream& os,
const cudaDeviceProp& p)
#include <taskflow/cuda/cuda_device.hpp>
dumps the device property
size_t tf:: cuda_get_device_max_threads_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum threads per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per block on a device
size_t tf:: cuda_get_device_max_y_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per block on a device
size_t tf:: cuda_get_device_max_z_dim_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per grid on a device
size_t tf:: cuda_get_device_max_y_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per grid on a device
size_t tf:: cuda_get_device_max_z_dim_per_grid(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per grid on a device
size_t tf:: cuda_get_device_max_shm_per_block(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the maximum shared memory size in bytes per block on a device
size_t tf:: cuda_get_device_warp_size(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the warp size on a device
int tf:: cuda_get_device_compute_capability_major(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the major number of compute capability of a device
int tf:: cuda_get_device_compute_capability_minor(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries the minor number of compute capability of a device
bool tf:: cuda_get_device_unified_addressing(int d)
#include <taskflow/cuda/cuda_device.hpp>
queries if the device supports unified addressing
int tf:: cuda_get_driver_version()
#include <taskflow/cuda/cuda_device.hpp>
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
int tf:: cuda_get_runtime_version()
#include <taskflow/cuda/cuda_device.hpp>
queries the CUDA Runtime version (1000 * major + 10 * minor)
size_t tf:: cuda_get_free_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>
queries the free memory (expensive call)
size_t tf:: cuda_get_total_mem(int d)
#include <taskflow/cuda/cuda_memory.hpp>
queries the total available memory (expensive call)
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N,
int d)
allocates memory on the given device for holding N elements of type T
The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.
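A hedged usage sketch pairing the allocation with the matching tf::cuda_free overload (N and the device id 0 are illustrative):
const size_t N = 1000;
int* data = tf::cuda_malloc_device<int>(N, 0);  // N*sizeof(int) bytes on device 0
// ... use data in kernels or copies issued on device 0 ...
tf::cuda_free(data, 0);                         // free on the same device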
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N)
allocates memory on the current device associated with the caller
The function is equivalent to calling cuda_malloc_device(N, d) with d being the current device associated with the caller.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr,
int d)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
d | device context identifier |
This method calls cudaFree to free the memory space pointed to by ptr using the given device context.
#include <taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.
void tf:: cuda_memcpy_async(cudaStream_t stream,
void* dst,
const void* src,
size_t count)
#include <taskflow/cuda/cuda_memory.hpp>
copies data between host and device asynchronously through a stream
Parameters | |
---|---|
stream | stream identifier |
dst | destination memory address |
src | source memory address |
count | size in bytes to copy |
The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.
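A hedged usage sketch copying host data to device memory through a raw CUDA stream (N is illustrative; the CUDA runtime calls are standard):
const size_t N = 1024;
std::vector<float> host(N, 1.0f);
float* device = tf::cuda_malloc_device<float>(N);
cudaStream_t stream;
cudaStreamCreate(&stream);
tf::cuda_memcpy_async(stream, device, host.data(), N * sizeof(float));  // host-to-device copy
cudaStreamSynchronize(stream);   // wait for the asynchronous copy to complete
tf::cuda_free(device);
cudaStreamDestroy(stream);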
void tf:: cuda_memset_async(cudaStream_t stream,
void* devPtr,
int value,
size_t count)
#include <taskflow/cuda/cuda_memory.hpp>
initializes or sets GPU memory to the given value byte by byte
Parameters | |
---|---|
stream | stream identifier |
devPtr | pointer to GPU memory |
value | value to set for each byte of the specified memory |
count | size in bytes to set |
The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.
const char* tf:: to_string(cudaTaskType type) constexpr
#include <taskflow/cuda/cuda_task.hpp>
convert a cuda_task type to a human-readable string
std:: ostream& tf:: operator<<(std:: ostream& os,
const cudaTask& ct)
#include <taskflow/cuda/cuda_task.hpp>
overload of ostream inserter operator for cudaTask
template<typename P, typename C>
void tf:: cuda_single_task(P&& p,
C c)
runs a callable asynchronously using one kernel thread
Template parameters | |
---|---|
P | execution policy type |
C | closure type |
Parameters | |
p | execution policy |
c | closure to run by one kernel thread |
The function launches a single kernel thread to run the given callable through the stream in the execution policy object.
template<typename P, typename I, typename C>
void tf:: cuda_for_each(P&& p,
I first,
I last,
C c)
performs asynchronous parallel iterations over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
C | unary operator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
c | unary operator to apply to each dereferenced iterator |
This function is equivalent to a parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { c(*itr); }
template<typename P, typename I, typename C>
void tf:: cuda_for_each_index(P&& p,
I first,
I last,
I inc,
C c)
performs asynchronous parallel iterations over an index-based range of items
Template parameters | |
---|---|
P | execution policy type |
I | input index type |
C | unary operator type |
Parameters | |
p | execution policy object |
first | index to the beginning of the range |
last | index to the end of the range |
inc | step size between successive iterations |
c | unary operator to apply to each index |
This function is equivalent to a parallel execution of the following loop on a GPU:
// inc is positive: [first, last)
for(auto i=first; i<last; i+=inc) {
  c(i);
}
// inc is negative: [first, last)
for(auto i=first; i>last; i+=inc) {
  c(i);
}
template<typename P, typename I, typename O, typename C>
void tf:: cuda_transform(P&& p,
I first,
I last,
O output,
C op)
performs asynchronous parallel transforms over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
output | iterator to the beginning of the output range |
op | unary operator to apply to transform each item |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename P, typename I1, typename I2, typename O, typename C>
void tf:: cuda_transform(P&& p,
I1 first1,
I1 last1,
I2 first2,
O output,
C op)
performs asynchronous parallel transforms over two ranges of items
Template parameters | |
---|---|
P | execution policy type |
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
p | execution policy |
first1 | iterator to the beginning of the first range |
last1 | iterator to the end of the first range |
first2 | iterator to the beginning of the second range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O>
void tf:: cuda_reduce(P&& p,
I first,
I last,
T* res,
O op,
void* buf)
performs asynchronous parallel reduction over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
op | binary operator to apply to reduce elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = op(*result, *first++); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O>
void tf:: cuda_uninitialized_reduce(P&& p,
I first,
I last,
T* res,
O op,
void* buf)
performs asynchronous parallel reduction over a range of items without an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
op | binary operator to apply to reduce elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = *first++;  // no initial value participates in the loop
while (first != last) {
  *result = op(*result, *first++);
}
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O, typename U>
void tf:: cuda_transform_reduce(P&& p,
I first,
I last,
T* res,
O bop,
U uop,
void* buf)
performs asynchronous parallel reduction over a range of transformed items with an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
bop | binary operator to apply to reduce elements |
uop | unary operator to apply to transform elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = bop(*result, uop(*first++)); }
#include <taskflow/cuda/algorithm/reduce.hpp>
template<typename P, typename I, typename T, typename O, typename U>
void tf:: cuda_uninitialized_transform_reduce(P&& p,
I first,
I last,
T* res,
O bop,
U uop,
void* buf)
performs asynchronous parallel reduction over a range of transformed items without an initial value
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
T | value type |
O | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
res | pointer to the result |
bop | binary operator to apply to reduce elements |
uop | unary operator to apply to transform elements |
buf | pointer to the temporary buffer |
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = uop(*first++);  // no initial value participates in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C>
void tf:: cuda_inclusive_scan(P&& p,
I first,
I last,
O output,
C op,
void* buf)
performs asynchronous inclusive scan over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C, typename U>
void tf:: cuda_transform_inclusive_scan(P&& p,
I first,
I last,
O output,
C bop,
U uop,
void* buf)
performs asynchronous inclusive scan over a range of transformed items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
bop | binary operator to apply to scan |
uop | unary operator to apply to transform each item before scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C>
void tf:: cuda_exclusive_scan(P&& p,
I first,
I last,
O output,
C op,
void* buf)
performs asynchronous exclusive scan over a range of items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/scan.hpp>
template<typename P, typename I, typename O, typename C, typename U>
void tf:: cuda_transform_exclusive_scan(P&& p,
I first,
I last,
O output,
C bop,
U uop,
void* buf)
performs asynchronous exclusive scan over a range of transformed items
Template parameters | |
---|---|
P | execution policy type |
I | input iterator |
O | output iterator |
C | binary operator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
bop | binary operator to apply to scan |
uop | unary operator to apply to transform each item before scan |
buf | pointer to the temporary buffer |
#include <taskflow/cuda/algorithm/merge.hpp>
template<typename P, typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf:: cuda_merge_by_key(P&& p,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp,
void* buf)
performs asynchronous key-value merge over a range of keys and values
Template parameters | |
---|---|
P | execution policy type |
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
Parameters | |
p | execution policy |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
buf | pointer to the temporary buffer |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)), such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)), such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
- a_keys = {1, 8}
- a_vals = {2, 1}
- b_keys = {3, 7}
- b_vals = {3, 4}
After the merge, we have:
- c_keys = {1, 3, 7, 8}
- c_vals = {2, 3, 4, 1}
#include <taskflow/cuda/algorithm/merge.hpp>
template<typename P, typename a_keys_it, typename b_keys_it, typename c_keys_it, typename C>
void tf:: cuda_merge(P&& p,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
c_keys_it c_keys_first,
C comp,
void* buf)
performs asynchronous key-only merge over a range of keys
Template parameters | |
---|---|
P | execution policy type |
a_keys_it | first key iterator type |
b_keys_it | second key iterator type |
c_keys_it | output key iterator type |
C | comparator type |
Parameters | |
p | execution policy |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
c_keys_first | iterator to the beginning of the output key range |
comp | comparator |
buf | pointer to the temporary buffer |
This function is equivalent to tf::cuda_merge_by_key without values.
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K, typename V = cudaEmpty>
unsigned tf:: cuda_sort_buffer_size(unsigned count)
queries the buffer size in bytes needed to call sort kernels for the given number of elements
Template parameters | |
---|---|
P | execution policy type |
K | key type |
V | value type (default tf::cudaEmpty) |
Parameters | |
count | number of keys/values to sort |
The function is used to allocate a buffer for calling asynchronous sort functions such as tf::cuda_sort and tf::cuda_sort_by_key.
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K_it, typename V_it, typename C>
void tf:: cuda_sort_by_key(P&& p,
K_it k_first,
K_it k_last,
V_it v_first,
C comp,
void* buf)
performs asynchronous key-value sort on a range of items
Template parameters | |
---|---|
P | execution policy type |
K_it | key iterator type |
V_it | value iterator type |
C | comparator type |
Parameters | |
p | execution policy |
k_first | iterator to the beginning of the key range |
k_last | iterator to the end of the key range |
v_first | iterator to the beginning of the value range |
comp | binary comparator |
buf | pointer to the temporary buffer |
Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.
For example, assume:
- keys are {1, 4, 2, 8, 5, 7}
- values are {'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
- keys are {1, 2, 4, 5, 7, 8}
- values are {'a', 'c', 'b', 'e', 'f', 'd'}
#include <taskflow/cuda/algorithm/sort.hpp>
template<typename P, typename K_it, typename C>
void tf:: cuda_sort(P&& p,
K_it k_first,
K_it k_last,
C comp,
void* buf)
performs asynchronous key-only sort on a range of items
Template parameters | |
---|---|
P | execution policy type |
K_it | key iterator type |
C | comparator type |
Parameters | |
p | execution policy |
k_first | iterator to the beginning of the key range |
k_last | iterator to the end of the key range |
comp | binary comparator |
buf | pointer to the temporary buffer |
This method is equivalent to tf::cuda_sort_by_key without values.
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename U>
void tf:: cuda_find_if(P&& p,
I first,
I last,
unsigned* idx,
U op)
finds the index of the first element that satisfies the given criteria
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
U | unary operator type |
Parameters | |
p | execution policy |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:
unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (op(*first)) {
    return idx;
  }
}
return idx;
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename O>
void tf:: cuda_min_element(P&& p,
I first,
I last,
unsigned* idx,
O op,
void* buf)
finds the index of the minimum element in a range
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | comparator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
buf | pointer to the buffer |
The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:
if(first == last) {
  return 0;
}
auto beg = first;
auto smallest = first;
for (++first; first != last; ++first) {
  if (op(*first, *smallest)) {
    smallest = first;
  }
}
return std::distance(beg, smallest);
#include <taskflow/cuda/algorithm/find.hpp>
template<typename P, typename I, typename O>
void tf:: cuda_max_element(P&& p,
I first,
I last,
unsigned* idx,
O op,
void* buf)
finds the index of the maximum element in a range
Template parameters | |
---|---|
P | execution policy type |
I | input iterator type |
O | comparator type |
Parameters | |
p | execution policy object |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
buf | pointer to the buffer |
The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_bufsz bytes for internal use. The function is equivalent to a parallel execution of the following loop:
if(first == last) {
  return 0;
}
auto beg = first;
auto largest = first;
for (++first; first != last; ++first) {
  if (op(*largest, *first)) {
    largest = first;
  }
}
return std::distance(beg, largest);
const char* tf:: version() constexpr
#include <taskflow/taskflow.hpp>
queries the version information in a string format major.minor.patch
Release notes are available here: https:/
Variable documentation
#include <taskflow/core/graph.hpp>
template<typename P>
bool tf:: is_task_params_v constexpr
determines if the given type is a task parameter type
Task parameters can be specified in one of the following types:
- tf::TaskParams: assign the struct of defined parameters
- tf::DefaultTaskParams: assign nothing
- std::string: assign a name to the task
std:: array<TaskType, 6> tf:: TASK_TYPES constexpr
#include <taskflow/core/task.hpp>
array of all task types (used for iterating task types)
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_subflow_task_v constexpr
determines if a callable is a dynamic task
A dynamic task is a callable object constructible from std::function<void(Subflow&)>.
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_condition_task_v constexpr
determines if a callable is a condition task
A condition task is a callable object constructible from std::function<int()>.
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_multi_condition_task_v constexpr
determines if a callable is a multi-condition task
A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>.
#include <taskflow/core/task.hpp>
template<typename C>
bool tf:: is_static_task_v constexpr
determines if a callable is a static task
A static task is a callable object constructible from std::function<void()> or std::function<void(tf::Runtime&)>.
#include <taskflow/algorithm/partitioner.hpp>
template<typename P>
bool tf:: is_partitioner_v constexpr
determines if a type is a partitioner
A partitioner is a derived type from tf::PartitionerBase.