Both the assignment step and the centroid update step are parallelisable at the GPU thread level. We write two CUDA kernels: one that assigns each point to its nearest centroid (one thread per point), and one that recomputes each centroid mean (one thread per centroid):
// Assigns each of the N points to its nearest of the K centroids and
// accumulates per-cluster partial sums for the subsequent mean update.
//
// Launch layout: 1D grid, one thread per point (threads with i >= N exit).
//   px, py : point coordinates, N elements each (read-only)
//   mx, my : current centroid coordinates, K elements each (read-only)
//   sx, sy : per-cluster coordinate sums, K elements each — must be zeroed
//            before launch; updated here with atomics
//   c      : per-cluster point counts, K elements — must be zeroed before
//            launch; updated here with atomics
__global__ void assign_clusters(
  const float* __restrict__ px, const float* __restrict__ py, int N,
  const float* __restrict__ mx, const float* __restrict__ my,
  float* sx, float* sy, int K, int* c
) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i >= N) return;

  const float x = px[i];
  const float y = py[i];

  // Linear scan over all K centroids for the minimum distance.
  // L2 is the project's squared-distance helper, defined elsewhere.
  float best_d = FLT_MAX;
  int best_k = 0;
  for(int k = 0; k < K; k++) {
    const float d = L2(x, y, mx[k], my[k]);
    if(d < best_d) {
      best_d = d;
      best_k = k;
    }
  }

  // NOTE(review): one atomic per point per array contends heavily when K is
  // small; a warp- or block-level partial reduction followed by one atomic
  // per warp/block would cut global atomic traffic if this becomes hot.
  atomicAdd(&sx[best_k], x);
  atomicAdd(&sy[best_k], y);
  atomicAdd(&c[best_k], 1);
}
// Recomputes each centroid as the mean of the points assigned to it.
//
// Launch layout: a single block of exactly K threads (one per centroid);
// there is no bounds check, so blockDim.x must equal K.
//   mx, my : centroid coordinates, overwritten with the new means
//   sx, sy : per-cluster coordinate sums produced by assign_clusters
//   c      : per-cluster point counts produced by assign_clusters
__global__ void compute_new_means(
  float* mx, float* my,
  const float* __restrict__ sx, const float* __restrict__ sy,
  const int* __restrict__ c
) {
  const int k = threadIdx.x;
  const int count = c[k];
  // Fix: a cluster with no assigned points keeps its previous centroid.
  // The original max(1, count) clamp silently teleported empty clusters
  // to (0, 0), because sx[k] == sy[k] == 0 when nothing was assigned.
  if(count > 0) {
    mx[k] = sx[k] / count;
    my[k] = sy[k] / count;
  }
}
We build a Taskflow that allocates GPU memory in parallel, copies data to the GPU, runs the CUDA graph for M iterations, copies results back, and frees GPU memory:
// Runs M iterations of k-means over N 2-D points with K clusters on the GPU.
// The first K points seed the initial centroids. A Taskflow graph performs
// the CPU-side orchestration (parallel allocation, H2D copy, GPU iterations,
// D2H copy, free); the per-iteration work is captured once in a CUDA graph
// and replayed M times without returning to the CPU between iterations.
//   N      : number of points
//   K      : number of clusters (K <= N)
//   M      : number of k-means iterations
//   px, py : host point coordinates, N elements each
// NOTE(review): cudaMalloc/cudaMemcpy return codes are unchecked throughout,
// matching the tutorial style — wrap them in an error-check macro for
// production use.
void kmeans_gpu(
  int N, int K, int M,
  const std::vector<float>& px, const std::vector<float>& py
) {
  // Seed the centroids with the first K points.
  std::vector<float> h_mx(px.begin(), px.begin() + K);
  std::vector<float> h_my(py.begin(), py.begin() + K);

  tf::Executor executor;
  tf::Taskflow taskflow("K-Means");

  float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;
  int *d_c;

  // Device allocations are independent of one another and run in parallel.
  auto alloc_px = taskflow.emplace([&]() {
    cudaMalloc(&d_px, N * sizeof(float));
  }).name("alloc_px");

  auto alloc_py = taskflow.emplace([&]() {
    cudaMalloc(&d_py, N * sizeof(float));
  }).name("alloc_py");

  auto alloc_mx = taskflow.emplace([&]() {
    cudaMalloc(&d_mx, K * sizeof(float));
  }).name("alloc_mx");

  auto alloc_my = taskflow.emplace([&]() {
    cudaMalloc(&d_my, K * sizeof(float));
  }).name("alloc_my");

  auto alloc_sx = taskflow.emplace([&]() {
    cudaMalloc(&d_sx, K * sizeof(float));
  }).name("alloc_sx");

  auto alloc_sy = taskflow.emplace([&]() {
    cudaMalloc(&d_sy, K * sizeof(float));
  }).name("alloc_sy");

  auto alloc_c = taskflow.emplace([&]() {
    cudaMalloc(&d_c, K * sizeof(int));
  }).name("alloc_c");

  // Copy points and seed centroids to the device.
  auto h2d = taskflow.emplace([&]() {
    cudaMemcpy(d_px, px.data(), N * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_py, py.data(), N * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_mx, h_mx.data(), K * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_my, h_my.data(), K * sizeof(float), cudaMemcpyDefault);
  }).name("h2d");

  // One k-means iteration captured as a CUDA graph:
  //   zero the accumulators -> assign points -> recompute means.
  auto kmeans = taskflow.emplace([&]() {
    tf::cudaGraph cg;

    auto zero_sx = cg.zero(d_sx, K);
    auto zero_sy = cg.zero(d_sy, K);
    auto zero_c  = cg.zero(d_c, K);

    // One thread per point, 512 threads per block (ceil-div for the grid).
    auto cluster = cg.kernel(
      (N + 511) / 512, 512, 0,
      assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
    );

    // One block of exactly K threads — one thread per centroid.
    auto new_centroid = cg.kernel(
      1, K, 0,
      compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
    );

    cluster.succeed(zero_sx, zero_sy, zero_c)
           .precede(new_centroid);

    // Instantiate once, then replay the whole graph M times on one stream
    // — no CPU round-trip between iterations.
    tf::cudaGraphExec exec(cg);
    tf::cudaStream stream;
    for(int i = 0; i < M; i++) {
      stream.run(exec);
    }
    stream.synchronize();
  }).name("update_means");

  // Copy the converged centroids back to the host.
  auto d2h = taskflow.emplace([&]() {
    cudaMemcpy(h_mx.data(), d_mx, K * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(h_my.data(), d_my, K * sizeof(float), cudaMemcpyDefault);
  }).name("d2h");

  auto free_mem = taskflow.emplace([&]() {
    cudaFree(d_px); cudaFree(d_py);
    cudaFree(d_mx); cudaFree(d_my);
    cudaFree(d_sx); cudaFree(d_sy);
    cudaFree(d_c);
  }).name("free");

  // h2d needs the buffers it writes; the GPU task needs everything plus the
  // copied data; results come back before the memory is released.
  h2d.succeed(alloc_px, alloc_py, alloc_mx, alloc_my);
  kmeans.succeed(alloc_sx, alloc_sy, alloc_c, h2d)
        .precede(d2h);
  d2h.precede(free_mem);

  executor.run(taskflow).wait();
}
class to create an executor
Definition executor.hpp:62
tf::Future< void > run(Taskflow &taskflow)
runs a taskflow once
class to create a taskflow object
Definition taskflow.hpp:64
class to create a CUDA graph with unique ownership
Definition cuda_graph.hpp:531
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
class to create an executable CUDA graph with unique ownership
Definition cuda_graph_exec.hpp:93
class to create a CUDA stream with unique ownership
Definition cuda_stream.hpp:189
cudaStreamBase & synchronize()
synchronizes the associated stream
Definition cuda_stream.hpp:232
cudaStreamBase & run(const cudaGraphExecBase< C, D > &exec)
runs the given executable CUDA graph
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407
The outer Taskflow orchestrates all CPU-side work: allocation tasks run in parallel where possible, the CUDA graph task runs M iterations entirely on the GPU without returning to the CPU between iterations, and finally results are copied back and memory is freed.
For small problem sizes, kernel launch and data transfer overhead makes the GPU slower than the CPU. At N = 100K, the GPU is over 12× faster than both the sequential and the parallel CPU implementations, demonstrating the value of offloading data-intensive inner loops to the GPU.