vector add

向量加法，跑在 h100 上。

cpp

#include <cuda_runtime.h>

__global__ void vector_add(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ C,
                           int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    int n4 = N / 4;
    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    float4* C4 = reinterpret_cast<float4*>(C);

    // Single float4 per iteration; use read-only cache for loads.
    for (int i = idx; i < n4; i += stride) {
        float4 a0 = __ldg(A4 + i);
        float4 b0 = __ldg(B4 + i);
        float4 c0 = {a0.x + b0.x, a0.y + b0.y, a0.z + b0.z, a0.w + b0.w};
        C4[i] = c0;
    }

    // Tail handling for elements not divisible by 4.
    int tail_start = n4 * 4;
    for (int i = tail_start + idx; i < N; i += stride) {
        C[i] = __ldg(A + i) + __ldg(B + i);
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = max(1, (N / 4 + threadsPerBlock - 1) / threadsPerBlock);

    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}

使用 ldg，在 read only 的这种流式读的情况下它可能比普通情况更好。
每个线程有序的去拿自己的就行了，需要考虑到外面 block 数可能过多或者过少的情况。
直接内存带宽打满就行了。

llama.cpp & ggml

Modern processor design

Computer Architecture:A Quantitative Approach

snippets

memory order

linux kernel

build process and boot

irq

syscall

memory manage

process schedule

io stack

driver

net

TS

python

beginning c++20

现代c++语言核心特性解析

c++新经典

杂记随笔

gdb

Debugging Techniques

xiangshan gem5

base

mem

configs

cpu

pred

minor

o3

sim

arch

generic

vta

dragon book

llvm

vector add

Modern processor design

Computer Architecture:A Quantitative Approach

gdb

pred

minor

o3

generic

vector add ​

vector add