Both the assignment step and the centroid update step are parallelisable at the GPU thread level. We write two CUDA kernels: one that assigns each point to its nearest centroid (one thread per point), and one that recomputes each centroid mean (one thread per centroid):
__global__ void assign_clusters(
float* px, float* py, int N,
float* mx, float* my, float* sx, float* sy, int K, int* c
) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= N) return;
float x = px[i], y = py[i];
float best_d = FLT_MAX;
int best_k = 0;
for(int k = 0; k < K; k++) {
float d = L2(x, y, mx[k], my[k]);
if(d < best_d) { best_d = d; best_k = k; }
}
atomicAdd(&sx[best_k], x);
atomicAdd(&sy[best_k], y);
atomicAdd(&c [best_k], 1);
}
__global__ void compute_new_means(
float* mx, float* my, float* sx, float* sy, int* c
) {
int k = threadIdx.x;
int count = max(1, c[k]);
mx[k] = sx[k] / count;
my[k] = sy[k] / count;
}
We build a Taskflow that allocates GPU memory in parallel, copies data to the GPU, runs the CUDA graph for M iterations, copies results back, and frees GPU memory:
void kmeans_gpu(
int N, int K, int M,
const std::vector<float>& px, const std::vector<float>& py
) {
std::vector<float> h_mx(px.begin(), px.begin() + K);
std::vector<float> h_my(py.begin(), py.begin() + K);
float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;
int *d_c;
auto alloc_px = taskflow.emplace([&]() {
cudaMalloc(&d_px, N * sizeof(float));
}).name("alloc_px");
auto alloc_py = taskflow.emplace([&]() {
cudaMalloc(&d_py, N * sizeof(float));
}).name("alloc_py");
auto alloc_mx = taskflow.emplace([&]() {
cudaMalloc(&d_mx, K * sizeof(float));
}).name("alloc_mx");
auto alloc_my = taskflow.emplace([&]() {
cudaMalloc(&d_my, K * sizeof(float));
}).name("alloc_my");
auto alloc_sx = taskflow.emplace([&]() {
cudaMalloc(&d_sx, K * sizeof(float));
}).name("alloc_sx");
auto alloc_sy = taskflow.emplace([&]() {
cudaMalloc(&d_sy, K * sizeof(float));
}).name("alloc_sy");
auto alloc_c = taskflow.emplace([&]() {
cudaMalloc(&d_c, K * sizeof(int));
}).name("alloc_c");
auto h2d = taskflow.emplace([&]() {
cudaMemcpy(d_px, px.data(), N * sizeof(float), cudaMemcpyDefault);
cudaMemcpy(d_py, py.data(), N * sizeof(float), cudaMemcpyDefault);
cudaMemcpy(d_mx, h_mx.data(), K * sizeof(float), cudaMemcpyDefault);
cudaMemcpy(d_my, h_my.data(), K * sizeof(float), cudaMemcpyDefault);
}).name("h2d");
auto kmeans = taskflow.emplace([&]() {
auto zero_sx = cg.
zero(d_sx, K);
auto zero_sy = cg.
zero(d_sy, K);
auto zero_c = cg.
zero(d_c, K);
(N + 511) / 512, 512, 0,
assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
);
1, K, 0,
compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
);
cluster.
succeed(zero_sx, zero_sy, zero_c)
for(int i = 0; i < M; i++) {
}
}).name("update_means");
auto d2h = taskflow.emplace([&]() {
cudaMemcpy(h_mx.data(), d_mx, K * sizeof(float), cudaMemcpyDefault);
cudaMemcpy(h_my.data(), d_my, K * sizeof(float), cudaMemcpyDefault);
}).name("d2h");
auto free_mem = taskflow.emplace([&]() {
cudaFree(d_px); cudaFree(d_py);
cudaFree(d_mx); cudaFree(d_my);
cudaFree(d_sx); cudaFree(d_sy);
cudaFree(d_c);
}).name("free");
h2d.succeed(alloc_px, alloc_py, alloc_mx, alloc_my);
kmeans.succeed(alloc_sx, alloc_sy, alloc_c, h2d)
.precede(d2h);
d2h.precede(free_mem);
executor.
run(taskflow).wait();
}
class to create an executor
Definition executor.hpp:62
tf::Future< void > run(Taskflow &taskflow)
runs a taskflow once
class to create a taskflow object
Definition taskflow.hpp:64
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
cudaStreamBase & synchronize()
synchronizes the associated stream
Definition cuda_stream.hpp:232
cudaStreamBase & run(const cudaGraphExecBase< C, D > &exec)
runs the given executable CUDA graph
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407
cudaGraphExecBase< cudaGraphExecCreator, cudaGraphExecDeleter > cudaGraphExec
default smart pointer type to manage a cudaGraphExec_t object with unique ownership
Definition cudaflow.hpp:23
cudaGraphBase< cudaGraphCreator, cudaGraphDeleter > cudaGraph
default smart pointer type to manage a cudaGraph_t object with unique ownership
Definition cudaflow.hpp:18
cudaStreamBase< cudaStreamCreator, cudaStreamDeleter > cudaStream
default smart pointer type to manage a cudaStream_t object with unique ownership
Definition cuda_stream.hpp:340
The outer Taskflow orchestrates all CPU-side work: allocation tasks run in parallel where possible, the CUDA graph task runs M iterations entirely on the GPU without returning to the CPU between iterations, and finally results are copied back and memory is freed.
For small problem sizes, kernel launch and data transfer overhead makes the GPU slower than the CPU. At N = 100K, the GPU is over 12× faster than the parallel CPU and over 12× faster than the sequential CPU, demonstrating the value of offloading data-intensive inner loops to the GPU.