about ggml backend
About backend, memory allocation, compute graph, compute process. all ref the simple-backend.cpp
.
log
Need register log callback, and use macro like GGML_DEBUG
to print log.
device
device means what select decide to run the ggml. device init in the function like ggml_backend_cpu_init
. To init devide, user need do like:
if(backend == cpu){
ggml_backend_cpu_init();
}else if(backend == gpu){
ggml_backend_gpu_init();
}else{
// like this
}
In the cpu:
ggml_backend_t ggml_backend_cpu_init(void) {
// initialize CPU backend now to avoid slowing the first graph computation
// This just allocate FP32 to FP16 Table
ggml_cpu_init();
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
if (ctx == NULL) {
return NULL;
}
// use how many thread to computer
ctx->n_threads = GGML_DEFAULT_N_THREADS;
// the threadpool
ctx->threadpool = NULL;
// this is a point to buffer, this buffer save the intermedicate
// variable in the computer process.
ctx->work_data = NULL;
// the buffer size
ctx->work_size = 0;
// the callback of compute finish
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
// this point to the cpu interface
/* .interface = */ ggml_backend_cpu_i,
// this point to the cpu device
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
};
if (cpu_backend == NULL) {
delete ctx;
return NULL;
}
return cpu_backend;
}
ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0)
this point to the device. device contains all the message about cpu.
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
/* .get_name = */ ggml_backend_cpu_reg_get_name,
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
/* .get_device = */ ggml_backend_cpu_reg_get_device,
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};
// this function just return the registered device.
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
// init CPU feature detection
ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_cpu_reg;
}
What important in cpu device is ggml_backend_cpu_get_proc_address
, this contains the many extra information about cpu, this function can return the function pointer, user can call this function pointer to get some information.
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
return (void *)fct;
}
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
// this function in cpu is important, the extra matrix extension should use this
ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
return (void *)fct;
}
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
}
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
return (void *)ggml_backend_cpu_set_abort_callback;
}
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
return (void *)ggml_numa_init;
}
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
return (void *)ggml_threadpool_new;
}
if (strcmp(name, "ggml_threadpool_free") == 0) {
return (void *)ggml_threadpool_free;
}
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
return (void *)ggml_backend_cpu_set_threadpool;
}
return NULL;
GGML_UNUSED(reg);
}
all of that is about devide init.
global ggml context
after the device registered, we should create global ggml context.
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
// create context
model.ctx = ggml_init(params);
ggml context don't have specific meaning in ggml, it just use to allocate some memory, so the ggml init will be call in some memory allocation case. The mem_size means the size the memory allocate want, no_alloc true means backend devide allocate the real buffer, false means ggml allocate real buffer.
new tensor object
// create tensors
model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
this just struct to save some config about tensor, no memory allocate.
tensor memory allocate
// create a backend buffer (backend memory) and alloc the tensors from the context
model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
this will call:
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
the ggml_backend_buffer_t
meneage the backend device memory. Three have ggml_backend_get_default_buffer_type
to determine with backend buffer type want to use. The most important is, if you want to use intel-amx and other cpu specific matrix extensions, you must let ggml_backend_get_default_buffer_type
to be the extension specific buffer.
ggml_backend_get_default_buffer_type
this use device interface to return the buffer type:
struct ggml_backend_buffer_type_i {
const char * (*get_name) (ggml_backend_buffer_type_t buft);
// allocate a buffer of this type
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
// tensor alignment
size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device;
void * context;
};
the ggml_backend_buffer_type_i seems like a backend buffer allocator, the memory allocate by ggml_backend_buffer_type_i, and operate in the ggml buffer type. The cpu extension implementer must care about this.
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
// see this, the max_size of the backend buffer defined by backend
size_t max_size = ggml_backend_buft_get_max_size(buft);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
// access all the tensors in ggml context
// it seems like the same ggml context can only do that once
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
// calculate this tensor need how many size
if (t->data == NULL && t->view_src == NULL) {
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
// per buffer size need < max buffer size
// allocate tensors in the current buffer
// the alloc_tensor_range allocate the memory
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
first = t;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
}
}
// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
if (n_buffers == 0) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
// n_buffers copy to ctx
// this also copy
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer;
}
alloc_tensor_range
static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
// the allocate backend buffer in there
// use the buffer type method to allocate buffer
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
free_buffers(buffers, n_buffers);
return false;
}
// add buffer to buffers array
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;
// simplely thinking, tallocr use ggml_backend_buffer_t to allocate memory for tensor
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
enum ggml_status status = GGML_STATUS_SUCCESS;
if (t->data == NULL) {
if (t->view_src == NULL) {
// no src, true allocate
status = ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
// have src, just a view
status = ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
status = ggml_backend_view_init(t);
}
}
if (status != GGML_STATUS_SUCCESS) {
GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
free_buffers(buffers, n_buffers);
return false;
}
}
return true;
}
also should care about that, both new and view will all call the init interface in ggml_backend_buffer_t.
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
GGML_ASSERT(ctx->buffers != NULL);
size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
total_size += ggml_backend_buffer_get_size(buffers[i]);
}
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
}
if more than one backend buffer use, thay will be save in ggml_backend_multi_buffer_context, return the ggml_backend_buffer_t, this ggml_backend_buffer_t merge this message.
copy tensor to backend
// load data from cpu memory to backend buffer
ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
this copy the data from host to backend buffer, this also will call the ggml_backend_buffer_t set tensor function.
create graph
// calculate the temporaly memory required to compute
// ggml_gallocr_t is a data struct contains some allocator and graph message
// Gragh allocator
ggml_gallocr_t allocr = NULL;
{
// use glloc to manage buffer
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
// create the worst case graph for memory usage estimation
struct ggml_cgraph * gf = build_graph(model);
// allocate memory
ggml_gallocr_reserve(allocr, gf);
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
}
build graph process
struct ggml_cgraph * build_graph(const simple_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// create a temporally context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// result = a*b^T
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
// build operations nodes
ggml_build_forward_expand(gf, result);
// delete the temporally context used to build the graph
ggml_free(ctx0);
return gf;
}
we can see that ggml_context was create again, but it just a temp variable. ggml_context does not have specical meaning in ggml, if you want allocate some memory for data struct, you can create ggml context.
build_graph use ggml_new_graph, first looking for the ggml_cgraph struct:
struct ggml_cgraph {
int size; // maximum number of nodes/leafs/grads/grad_accs
// this is not memory size, just the max count of node
int n_nodes; // number of nodes currently in use
int n_leafs; // number of leafs currently in use
struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes, use for train?
struct ggml_tensor ** grad_accs; // accumulators for node gradients, use for train?
struct ggml_tensor ** leafs; // tensors with constant data
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order; // define the computer order
};
struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
}
struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
const size_t obj_size = ggml_graph_nbytes(size, grads);
// this allocate the memory for the compute graph from ggml_context
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
// the size of the hash table is doubled since it needs to hold both nodes and leafs
size_t hash_size = ggml_hash_size(size * 2);
// move the p to the memory begin
void * p = cgraph + 1;
// set these pointer, p will move in the process
struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
// check that we allocated the correct amount of memory
assert(obj_size == (size_t)((char *)p - (char *)cgraph));
*cgraph = (struct ggml_cgraph) {
/*.size =*/ size,
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.grad_accs =*/ grad_accs_ptr,
/*.leafs =*/ leafs_ptr,
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
};
// just memset the hash_table
ggml_hash_set_reset(&cgraph->visited_hash_set);
if (grads) {
memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *));
memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
}
return cgraph;
}
after create, add the compute to the graph:
// result = a*b^T
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
this just create the tensor, set tensors src then return the tensor.
After that, add tensor to the graph:
// build operations nodes
ggml_build_forward_expand(gf, result);
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
ggml_build_forward_impl(cgraph, tensor, true);
}
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
if (!expand) {
// TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
ggml_graph_clear(cgraph);
}
const int n0 = cgraph->n_nodes;
// visit graph, and add
ggml_visit_parents(cgraph, tensor);
const int n_new = cgraph->n_nodes - n0;
GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
if (n_new > 0) {
// the last added node should always be starting point
GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
}
}
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
// check if already visited
// already visited will be set in hash_table
if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
return;
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
const int k =
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
/* unknown order, just fall back to using i*/ i;
if (node->src[k]) {
// recur visit parent to genertee all parent in the graph
ggml_visit_parents(cgraph, node->src[k]);
}
}
// add as leaf
if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < cgraph->size);
if (strlen(node->name) == 0) {
ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
}
cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
// add as node
} else {
GGML_ASSERT(cgraph->n_nodes < cgraph->size);
if (strlen(node->name) == 0) {
ggml_format_name(node, "node_%d", cgraph->n_nodes);
}
cgraph->nodes[cgraph->n_nodes] = node;
cgraph->n_nodes++;
}
}
then ggml_gallocr_reserve:
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// this seems like create hash_table for quick find node
// the hash_table in cgraph use to check if the node in the cgraph
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
// initialize hash table
if (galloc->hash_set.size < min_hash_size) {
ggml_hash_set_free(&galloc->hash_set);
galloc->hash_set = ggml_hash_set_new(min_hash_size);
GGML_ASSERT(galloc->hash_set.keys != NULL);
free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
GGML_ASSERT(galloc->hash_values != NULL);
}
// reset ggml_dyn_tallocr
for (int i = 0; i < galloc->n_buffers; i++) {
ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
}
// allocate in hash table
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
// set the node_allocs from the hash table
if (galloc->n_nodes < graph->n_nodes) {
free(galloc->node_allocs);
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
GGML_ASSERT(galloc->node_allocs != NULL);
}
galloc->n_nodes = graph->n_nodes;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (node->view_src || node->data) {
node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX;
node_alloc->dst.size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
node_alloc->dst.buffer_id = hn->buffer_id;
node_alloc->dst.offset = hn->offset;
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) {
node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX;
node_alloc->src[j].size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
}
}
}
if (galloc->n_leafs < graph->n_leafs) {
free(galloc->leaf_allocs);
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
GGML_ASSERT(galloc->leaf_allocs != NULL);
}
galloc->n_leafs = graph->n_leafs;
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
}
// reallocate buffers if needed
// after before are calculate size, in this are allocate
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
for (int j = 0; j < i; j++) {
if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
galloc->buffers[i] = galloc->buffers[j];
break;
}
}
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}
return true;
}
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
ggml_hash_set_reset(&galloc->hash_set);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
// I cant see any allocate from ggml_backend_buffer_type, this means before build the graph, leaf node should allocate memory itself
// this just count some message about leaf in galloc and ggml_dyn_tallocr
ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
}
// count number of children and views
// allocate other graph inputs and leafs first to avoid overwriting them
// count the child and other message, make efficient
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
// TODO: better way to add external dependencies
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
if (node->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
// allocate explicit inputs
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
}
}
}
// allocate tensors
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int buffer_id = get_node_buffer_id(node_buffer_ids, i);
// allocate parents (only leafs need to be allocated at this point)
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
continue;
}
ggml_gallocr_allocate_node(galloc, parent, buffer_id);
}
// allocate node
// memory allocate by buffer_id?
ggml_gallocr_allocate_node(galloc, node, buffer_id);
AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
continue;
}
AT_PRINTF("%s", parent->name);
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
AT_PRINTF(", ");
}
}
AT_PRINTF("\n");
// update parents
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
continue;
}
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
p_hn->n_children -= 1;
AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n",
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
ggml_gallocr_free_node(galloc, view_src);
}
}
else if (p_hn->allocated) {
ggml_gallocr_free_node(galloc, parent);
}
}
AT_PRINTF("\n");
}
}
}
compute
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
// reset the allocator to free all the memory allocated during the previous inference
struct ggml_cgraph * gf = build_graph(model);
// allocate tensors
// connect buffer with tensor
ggml_gallocr_alloc_graph(allocr, gf);
int n_threads = 1; // number of threads to perform some operations with multi-threading
if (ggml_backend_is_cpu(model.backend)) {
// this is user set n_threads to computer
// set the how many threads use in cpu
// just set cpucontext n_threads
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}
ggml_backend_graph_compute(model.backend, gf);
// in this case, the output tensor is the last one in the graph
// return the result
return ggml_graph_node(gf, -1);
}
computer the graph:
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
// do computer
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
// if hava sync callback, sync
ggml_backend_synchronize(backend);
return err;
}
in the cpu, the graph compute is:
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
// create compute plan, how many intermedicate memory should be use?
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
// if the cpu_ctx work_size < plan work_size, realloc
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
// realloc memory
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
// set the abort callback
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
}
struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads,
struct ggml_threadpool * threadpool) {
if (threadpool == NULL) {
//GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
}
if (n_threads <= 0) {
n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
}
size_t work_size = 0;
// plan is a buffer use for data
struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
int max_tasks = 1;
// thread scheduling for the different operations + work buffer size estimation
// iter node to calculate the intermedicate buffer size and task size
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
// return this node op need how many thread
const int n_tasks = ggml_get_n_tasks(node, n_threads);
max_tasks = MAX(max_tasks, n_tasks);
size_t cur = 0;
// may need more woek size?
// is have extra, this function will change cur
// return the size caculate should use
if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
switch (node->op) {
// ...
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
case GGML_OP_MUL_MAT:
{
const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
}
} break;
// ...
default:
break;
}
}
// need how many memory
work_size = MAX(work_size, cur);
}
// per thread have cache line
if (work_size > 0) {
work_size += CACHE_LINE_SIZE*(n_threads);
}
cplan.threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads);
// intermedicate data size, max of all task
cplan.work_size = work_size;
// work_data wait allocate by buffer
cplan.work_data = NULL;
return cplan;
}
all is ready, do the graph computer
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
ggml_cpu_init();
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
int n_threads = cplan->n_threads;
struct ggml_threadpool * threadpool = cplan->threadpool;
bool disposable_threadpool = false;
// if threadpool is null, allocate a new threadpool
if (threadpool == NULL) {
//GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
disposable_threadpool = true;
struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
} else {
// Reset some of the parameters that need resetting
// No worker threads should be accessing the parameters below at this stage
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
threadpool->current_chunk = 0;
threadpool->abort = -1;
threadpool->ec = GGML_STATUS_SUCCESS;
}
#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
#pragma omp parallel num_threads(n_threads)
{
#pragma omp single
{
// update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads();
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
}
// openmp will unroll
// use openmp to unroll
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
}
} else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
ggml_graph_compute_thread(&threadpool->workers[0]);
}
#else
if (n_threads > threadpool->n_threads_max) {
GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
n_threads = threadpool->n_threads_max;
}
// Kick all threads to start the new graph
ggml_graph_compute_kickoff(threadpool, n_threads);
// This is a work thread too
ggml_graph_compute_thread(&threadpool->workers[0]);
#endif
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
enum ggml_status ret = threadpool->ec;
if (disposable_threadpool) {
ggml_threadpool_free(threadpool);
}
return ret;
}
the important function is ggml_graph_compute_thread
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_threadpool * tp = state->threadpool;
const struct ggml_cgraph * cgraph = tp->cgraph;
const struct ggml_cplan * cplan = tp->cplan;
set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
/*.ith =*/ state->ith, // ith means cur threads id
/*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ tp,
};
// thread parallel in node level
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
// per thread do own work
ggml_compute_forward(¶ms, node);
if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
tp->ec = GGML_STATUS_ABORTED;
}
// wait for other thread
if (node_n + 1 < cgraph->n_nodes) {
ggml_barrier(state->threadpool);
}
// all thread finish goto next node
}
ggml_barrier(state->threadpool);
return 0;
}
the ggml_compute_forward is real computer function:
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
return;
}
// extra_buffer use to cpu extension
if (ggml_cpu_extra_compute_forward(params, tensor)) {
return;
}
// this is the default implement
switch (tensor->op) {
// ...
case GGML_OP_MUL_MAT:
{
ggml_compute_forward_mul_mat(params, tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
ggml_compute_forward_mul_mat_id(params, tensor);
} break;
// ...
case GGML_OP_COUNT:
{
GGML_ABORT("fatal error");
}
}
}
after that, the compute is finish, the result is in the node wait be use.
extra buffer
extra buffer use for cpu extension like intel amx and other.
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
if (tensor_traits && tensor_traits->compute_forward(params, op)) {
return true;
}
}
}
return false;
}
this check is any cpu extension can accelerate the computer, such as gemm, these can be use DSA to computer.