Vector add
Vector addition, running on an H100.
```cpp
#include <cuda_runtime.h>

__global__ void vector_add(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ C,
                           int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Vectorized body: one float4 (16 bytes) per iteration, with loads routed
    // through the read-only data cache via __ldg. Assumes A, B, C are at least
    // 16-byte aligned, which cudaMalloc guarantees.
    int n4 = N / 4;
    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    float4* C4 = reinterpret_cast<float4*>(C);
    for (int i = idx; i < n4; i += stride) {
        float4 a0 = __ldg(A4 + i);
        float4 b0 = __ldg(B4 + i);
        float4 c0 = {a0.x + b0.x, a0.y + b0.y, a0.z + b0.z, a0.w + b0.w};
        C4[i] = c0;
    }

    // Scalar tail for the last N % 4 elements when N is not divisible by 4.
    int tail_start = n4 * 4;
    for (int i = tail_start + idx; i < N; i += stride) {
        C[i] = __ldg(A + i) + __ldg(B + i);
    }
}
// A, B, C are device pointers (i.e. pointers to memory on the GPU).
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    // Size the grid for the float4 body; the grid-stride loop in the kernel
    // absorbs any over- or under-provisioning of blocks.
    int blocksPerGrid = max(1, (N / 4 + threadsPerBlock - 1) / threadsPerBlock);
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}
```

- Use `__ldg` so loads go through the read-only data cache; for a streaming, read-only access pattern like this it can do better than plain global loads.
- Each thread just walks through its own elements in order via the grid-stride loop, which also covers the cases where the launch supplies too many or too few blocks (a grid-sizing sketch follows these notes).
- The kernel is purely memory-bound, so the only goal is to saturate memory bandwidth (the timing harness at the end shows one way to check this).
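On the block-count point: one option is to ask the occupancy API how many blocks the device can actually keep resident, and clamp the grid to that, letting the grid-stride loop absorb the remaining work. The sketch below is only an illustration of that idea, not part of the submitted solution; it assumes it lives in the same `.cu` file as `vector_add` above, and the helper name `pick_grid_size` is made up.

```cpp
#include <algorithm>

// Hypothetical helper: size the grid for N/4 float4 iterations, but never ask
// for more blocks than the GPU can keep resident at once. Any surplus work is
// picked up by the kernel's grid-stride loop.
static int pick_grid_size(int N, int threadsPerBlock) {
    int device = 0;
    cudaGetDevice(&device);

    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &blocksPerSM, vector_add, threadsPerBlock, /*dynamicSMemSize=*/0);

    int needed   = std::max(1, (N / 4 + threadsPerBlock - 1) / threadsPerBlock);
    int resident = std::max(1, smCount * blocksPerSM);
    return std::min(needed, resident);
}
```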
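To check the bandwidth claim, a small host-side harness can time the kernel with CUDA events and convert the elapsed time into GB/s, counting two reads plus one write of 4 bytes per element. This is a minimal sketch of how `solve` might be driven, not part of the original code; the problem size, input values, and spot check are arbitrary, and it assumes it is compiled into or linked against the solution above.

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

extern "C" void solve(const float* A, const float* B, float* C, int N);

int main() {
    const int N = 1 << 26;  // arbitrary test size (~67M floats)
    std::vector<float> hA(N), hB(N), hC(N);
    for (int i = 0; i < N; ++i) { hA[i] = float(i); hB[i] = 2.0f; }

    float *dA, *dB, *dC;
    cudaMalloc(&dA, N * sizeof(float));
    cudaMalloc(&dB, N * sizeof(float));
    cudaMalloc(&dC, N * sizeof(float));
    cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice);

    solve(dA, dB, dC, N);  // warm-up

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    solve(dA, dB, dC, N);  // timed run
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    // Traffic: 2 reads + 1 write of sizeof(float) bytes per element.
    double gbMoved = 3.0 * double(N) * sizeof(float) / 1e9;
    printf("%.3f ms, %.1f GB/s\n", ms, gbMoved / (ms / 1e3));

    // Spot check: C[i] should equal i + 2 for small i.
    cudaMemcpy(hC.data(), dC, N * sizeof(float), cudaMemcpyDeviceToHost);
    printf("C[5] = %f (expect 7)\n", hC[5]);

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}
```

The reported GB/s can then be compared against the device's theoretical HBM bandwidth to judge how close the kernel gets to saturating memory.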