Loading...
Searching...
No Matches
k-means Clustering with CUDA GPU

Following k-means Clustering, we accelerate k-means clustering on a CUDA GPU using tf::cudaGraph. The GPU's massive thread-level parallelism makes the assignment step and centroid update step much faster, achieving over 6× speed-up over the parallel CPU implementation (and over 12× over the sequential CPU) at large problem sizes.

CUDA Kernels

Both the assignment step and the centroid update step are parallelisable at the GPU thread level. We write two CUDA kernels: one that assigns each point to its nearest centroid (one thread per point), and one that recomputes each centroid mean (one thread per centroid):

// Assignment step of k-means: each GPU thread owns one point, scans all K
// centroids for the smallest squared L2 distance, and folds the point into
// that centroid's running sums (sx, sy) and count (c).
// Launch layout: 1D grid, one thread per point; threads with index >= N exit.
__global__ void assign_clusters(
float* px, float* py, int N,
float* mx, float* my, float* sx, float* sy, int K, int* c
) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if(gid < N) {
    const float x = px[gid];
    const float y = py[gid];
    int nearest = 0;
    float nearest_d = FLT_MAX;
    for(int k = 0; k < K; ++k) {
      const float d = L2(x, y, mx[k], my[k]);
      if(d < nearest_d) {
        nearest_d = d;
        nearest = k;
      }
    }
    // many threads may pick the same centroid bucket concurrently,
    // so the accumulation must be atomic to avoid lost updates
    atomicAdd(&sx[nearest], x);
    atomicAdd(&sy[nearest], y);
    atomicAdd(&c[nearest], 1);
  }
}
// Update step of k-means: one thread per centroid divides the accumulated
// coordinate sums by the cluster population to obtain the new mean.
// Launch layout: a single block of K threads (see the cg.kernel(1, K, ...)
// launch below), so this assumes K does not exceed the block-size limit.
__global__ void compute_new_means(
float* mx, float* my, float* sx, float* sy, int* c
) {
  const int k = threadIdx.x;
  // clamp the divisor to 1 so an empty cluster (zeroed sums, zero count)
  // yields a finite centroid instead of a NaN from 0/0
  const int population = max(1, c[k]);
  mx[k] = sx[k] / population;
  my[k] = sy[k] / population;
}

Because multiple GPU threads may update the same centroid bucket in assign_clusters, we use atomic operations (atomicAdd) on sx, sy, and c to avoid data races.

CUDA Graph Task

We build a Taskflow that allocates GPU memory in parallel, copies data to the GPU, runs the CUDA graph for M iterations, copies results back, and frees GPU memory:

// Runs M iterations of k-means over N 2D points with K centroids on the GPU.
//
// N  - number of points
// K  - number of clusters (must fit in one thread block for the update kernel)
// M  - number of k-means iterations
// px, py - host-side point coordinates (size N); the first K points seed
//          the initial centroids
//
// The outer Taskflow performs all device-memory allocations in parallel,
// copies inputs to the GPU, builds ONE CUDA graph (zero accumulators ->
// assign points -> recompute means) and replays it M times on a single
// stream without returning to the CPU between iterations, then copies the
// final centroids back and frees device memory.
void kmeans_gpu(
  int N, int K, int M,
  const std::vector<float>& px, const std::vector<float>& py
) {
  // seed the centroids with the first K points
  std::vector<float> h_mx(px.begin(), px.begin() + K);
  std::vector<float> h_my(py.begin(), py.begin() + K);
  // device buffers: points, centroid means, per-centroid sums and counts
  float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;
  int *d_c;
  tf::Executor executor;
  tf::Taskflow taskflow("K-Means GPU");
  // allocate GPU memory for all arrays in parallel
  auto alloc_px = taskflow.emplace([&]() {
    cudaMalloc(&d_px, N * sizeof(float));
  }).name("alloc_px");
  auto alloc_py = taskflow.emplace([&]() {
    cudaMalloc(&d_py, N * sizeof(float));
  }).name("alloc_py");
  auto alloc_mx = taskflow.emplace([&]() {
    cudaMalloc(&d_mx, K * sizeof(float));
  }).name("alloc_mx");
  auto alloc_my = taskflow.emplace([&]() {
    cudaMalloc(&d_my, K * sizeof(float));
  }).name("alloc_my");
  auto alloc_sx = taskflow.emplace([&]() {
    cudaMalloc(&d_sx, K * sizeof(float));
  }).name("alloc_sx");
  auto alloc_sy = taskflow.emplace([&]() {
    cudaMalloc(&d_sy, K * sizeof(float));
  }).name("alloc_sy");
  auto alloc_c = taskflow.emplace([&]() {
    cudaMalloc(&d_c, K * sizeof(int));
  }).name("alloc_c");
  // copy point data and initial centroids to GPU
  auto h2d = taskflow.emplace([&]() {
    cudaMemcpy(d_px, px.data(), N * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_py, py.data(), N * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_mx, h_mx.data(), K * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_my, h_my.data(), K * sizeof(float), cudaMemcpyDefault);
  }).name("h2d");
  // build the CUDA graph once and replay it for M iterations
  auto kmeans = taskflow.emplace([&]() {
    // BUGFIX: cg, exec, and stream were used without being declared;
    // the CUDA graph, its executable, and the stream must be created here.
    tf::cudaGraph cg;
    // zero accumulator arrays at the start of each iteration
    auto zero_sx = cg.zero(d_sx, K);
    auto zero_sy = cg.zero(d_sy, K);
    auto zero_c = cg.zero(d_c, K);
    // one thread per point for assignment (ceil-div so the tail is covered;
    // the kernel bounds-checks i < N)
    auto cluster = cg.kernel(
      (N + 511) / 512, 512, 0,
      assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
    );
    // one block of K threads for the centroid update
    auto new_means = cg.kernel(
      1, K, 0,
      compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
    );
    cluster.succeed(zero_sx, zero_sy, zero_c)
           .precede(new_means);
    // instantiate once, then run the executable graph M times on one
    // stream — no host round-trip between iterations
    tf::cudaGraphExec exec(cg);
    tf::cudaStream stream;
    for(int i = 0; i < M; i++) {
      stream.run(exec);
    }
    stream.synchronize();
  }).name("kmeans");  // BUGFIX: was misnamed "update_means"; this task runs the whole iteration loop
  // copy final centroids back to host
  auto d2h = taskflow.emplace([&]() {
    cudaMemcpy(h_mx.data(), d_mx, K * sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(h_my.data(), d_my, K * sizeof(float), cudaMemcpyDefault);
  }).name("d2h");
  // free all GPU memory
  auto free_mem = taskflow.emplace([&]() {
    cudaFree(d_px); cudaFree(d_py);
    cudaFree(d_mx); cudaFree(d_my);
    cudaFree(d_sx); cudaFree(d_sy);
    cudaFree(d_c);
  }).name("free");
  // wire dependencies: copies wait on their buffers, the graph task waits
  // on the accumulator buffers and the input copies, results come back
  // before memory is released
  h2d.succeed(alloc_px, alloc_py, alloc_mx, alloc_my);
  kmeans.succeed(alloc_sx, alloc_sy, alloc_c, h2d)
        .precede(d2h);
  d2h.precede(free_mem);
  executor.run(taskflow).wait();
}
class to create an executor
Definition executor.hpp:62
tf::Future< void > run(Taskflow &taskflow)
runs a taskflow once
class to create a taskflow object
Definition taskflow.hpp:64
class to create a CUDA graph with unique ownership
Definition cuda_graph.hpp:531
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
class to create an executable CUDA graph with unique ownership
Definition cuda_graph_exec.hpp:93
class to create a CUDA stream with unique ownership
Definition cuda_stream.hpp:189
cudaStreamBase & synchronize()
synchronizes the associated stream
Definition cuda_stream.hpp:232
cudaStreamBase & run(const cudaGraphExecBase< C, D > &exec)
runs the given executable CUDA graph
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407

The outer Taskflow orchestrates all CPU-side work: allocation tasks run in parallel where possible, the CUDA graph task runs M iterations entirely on the GPU without returning to the CPU between iterations, and finally results are copied back and memory is freed.

The task graph structure is shown below:

Benchmarking

Runtime comparison across three implementations on a 12-core Intel i7-8700 at 3.20 GHz and a Nvidia RTX 2080:

N K M CPU sequential CPU parallel GPU
10 5 10 0.14 ms 77 ms 1 ms
100 10 100 0.56 ms 86 ms 7 ms
1000 10 1000 10 ms 98 ms 55 ms
10000 10 10000 1006 ms 713 ms 458 ms
100000 10 100000 102483 ms 49966 ms 7952 ms

For small problem sizes, kernel launch and data transfer overhead makes the GPU slower than the CPU. At N = 100K, the GPU is over 6× faster than the parallel CPU and over 12× faster than the sequential CPU, demonstrating the value of offloading data-intensive inner loops to the GPU.