
llama: automatically set runtime parameters such as --n-gpu-layers to fit VRAM #14067


Draft · wants to merge 1 commit into master
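In outline, the patch adds a common_fit_to_free_memory() helper that queries the free memory of every backend device and then searches over n_gpu_layers until the usage predicted by the new llama_expected_memory_use() API leaves at least a fixed 1 GiB margin free on each device. The snippet below is a condensed, illustrative sketch of that search only, not code from this diff; fit_n_gpu_layers and fits are made-up names standing in for the real helper and its memory check.

#include <functional>

// Illustrative sketch: find the largest n_gpu_layers for which `fits` returns
// true, assuming `fits` is monotone (once it fails, larger values keep failing).
// In this PR, "fits" corresponds to: the memory predicted by
// llama_expected_memory_use() stays at least 1 GiB below the free memory
// reported by ggml_backend_dev_memory() on every device.
static int fit_n_gpu_layers(const std::function<bool(int)> & fits, int ngl_max) {
    if (!fits(0)) {
        return 0;       // even offloading nothing leaves too little free memory
    }
    if (fits(ngl_max)) {
        return ngl_max; // everything can be offloaded
    }
    int lo = 0;         // invariant: fits(lo) is true
    int hi = ngl_max;   // invariant: fits(hi) is false
    while (hi - lo > 1) {
        const int mid = (lo + hi) / 2;
        if (fits(mid)) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    return lo;
}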
71 changes: 69 additions & 2 deletions common/common.cpp
@@ -885,9 +885,78 @@ std::string fs_get_cache_file(const std::string & filename) {
// Model utils
//

static void common_fit_to_free_memory(
const std::string & path_model, llama_model_params & mparams, llama_context_params & cparams, const size_t margin) {

std::vector<ggml_backend_dev_t> devices(ggml_backend_dev_count());
for (size_t i = 0; i < devices.size(); i++) {
devices[i] = ggml_backend_dev_get(i);
}

std::vector<size_t> memory_total(devices.size());
std::vector<size_t> memory_free(devices.size());
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_memory(devices[i], memory_free.data() + i, memory_total.data() + i);
}

auto get_min_margin = [path_model, memory_free](const llama_model_params & mparams_test, const llama_context_params & cparams_test) {
std::vector<size_t> memory_expect(memory_free.size());
GGML_ASSERT(llama_expected_memory_use(path_model.c_str(), mparams_test, cparams_test, memory_expect.data()));

int64_t min_margin = INT64_MAX;
for (size_t i = 0; i < memory_free.size(); i++) {
min_margin = std::min(min_margin, int64_t(memory_free[i]) - int64_t(memory_expect[i]));
}
return min_margin;
};
auto test_ngl = [mparams, cparams, get_min_margin](const int ngl) {
llama_model_params mparams_test = mparams;
mparams_test.n_gpu_layers = ngl;
return get_min_margin(mparams_test, cparams);
};

int ngl_low = 0;
int64_t margin_low = test_ngl(ngl_low);
if (margin_low < int64_t(margin)) {
mparams.n_gpu_layers = ngl_low;
return;
}

int ngl_high = 128; // FIXME
int64_t margin_high = test_ngl(ngl_high);
if (margin_high >= int64_t(margin)) {
mparams.n_gpu_layers = ngl_high;
return;
}

// TODO bisection is inefficient, better to interpolate if the max ngl value is known
while (ngl_high - ngl_low > 1) {
const int ngl_test = (ngl_high + ngl_low) / 2;
const int64_t margin_test = test_ngl(ngl_test);

if (margin_test < int64_t(margin)) {
ngl_high = ngl_test;
margin_high = margin_test;
} else {
ngl_low = ngl_test;
margin_low = margin_test;
}
}

if (margin_high >= int64_t(margin)) {
mparams.n_gpu_layers = ngl_high;
} else {
mparams.n_gpu_layers = ngl_low;
}
}

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);

constexpr size_t margin = 1024*1024*1024;
common_fit_to_free_memory(params.model.path, mparams, cparams, margin);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
@@ -925,8 +994,6 @@ struct common_init_result common_init_from_params(common_params & params) {
}
}

auto cparams = common_context_params_to_llama(params);

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
6 changes: 5 additions & 1 deletion ggml/include/ggml-alloc.h
@@ -9,6 +9,7 @@ extern "C" {
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;

// Tensor allocator
struct ggml_tallocr {
@@ -58,16 +59,19 @@ GGML_API bool ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
const int * leaf_buffer_ids);
const int * leaf_buffer_ids,
bool dry_run);

// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

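As a hedged usage sketch (not part of this diff): with the new dry_run parameter and the two sizing helpers declared above, memory requirements can be measured without allocating backend memory. The gallocr, graph, context and device below are assumed to have been set up by the caller in the usual way.

#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: measure the compute and tensor buffer sizes for one device without
// allocating backend memory. `galloc` was created for the device's buffer type,
// `gf` is the graph to measure, and `ctx` is a no_alloc ggml_context holding
// the tensors destined for that device.
static void measure_device_usage(ggml_gallocr_t galloc, struct ggml_cgraph * gf,
                                 struct ggml_context * ctx, ggml_backend_dev_t dev,
                                 size_t * compute_bytes, size_t * weight_bytes) {
    ggml_gallocr_reserve_n(galloc, gf, /*node_buffer_ids =*/ NULL,
                           /*leaf_buffer_ids =*/ NULL, /*dry_run =*/ true);
    *compute_bytes = ggml_gallocr_get_max_size(galloc, dev);

    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
    *weight_bytes = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
}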
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -293,6 +293,7 @@ extern "C" {
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); // result per backend is written to sizes
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
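A hedged sketch of how the new ggml_backend_sched_reserve_size() might be consumed (not part of this diff): one size is written per backend, in the scheduler's backend order, so it can be paired with ggml_backend_sched_get_backend().

#include <cstdio>
#include <vector>
#include "ggml-backend.h"

// Sketch: report the expected compute-buffer size per backend for a measure
// graph without allocating anything. `sched` and `gf` are assumed to be set up
// exactly as they would be for ggml_backend_sched_reserve().
static void print_expected_compute_sizes(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    const int n_backends = ggml_backend_sched_get_n_backends(sched);
    std::vector<size_t> sizes(n_backends);
    ggml_backend_sched_reserve_size(sched, gf, sizes.data());
    for (int i = 0; i < n_backends; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        printf("%s: %zu bytes\n", ggml_backend_name(backend), sizes[i]);
    }
}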
60 changes: 50 additions & 10 deletions ggml/src/ggml-alloc.c
@@ -150,6 +150,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif

// returns the offset for the allocation
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);

@@ -472,7 +473,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
return t->data != NULL // tensor data already set externally
|| t->buffer // tensor on external buffer (but may not yet be allocated)
|| ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
}

static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
@@ -670,7 +673,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
const int * node_buffer_ids, const int * leaf_buffer_ids, bool dry_run) {
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -768,7 +772,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
#endif

ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], dry_run ? 0 : new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
@@ -781,7 +785,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL, /*dry_run =*/ false);
}

static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
@@ -934,6 +938,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}

size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev) {
for (int i = 0; i < galloc->n_buffers; i++) {
if (ggml_backend_buft_get_device(galloc->bufts[i]) == dev) {
return ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
}
}
return 0;
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
@@ -984,14 +997,16 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
return true;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool dry_run) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);

ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
*nbytes_total = 0;

size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1003,10 +1018,13 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
if (!dry_run) {
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
first = t;
*nbytes_total += cur_buf_size;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
@@ -1015,15 +1033,23 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
*nbytes_total += cur_buf_size;
if (!dry_run) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
}

if (dry_run) {
return NULL;
}

if (n_buffers == 0) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
GGML_ASSERT(!buffers);
return NULL;
}

@@ -1033,10 +1059,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
if (buffers) {
free(buffers); // can be NULL if dry_run or context is empty
}
return buffer;
}

size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ true);
GGML_ASSERT(!buf);
return nbytes_total;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ false);
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
22 changes: 20 additions & 2 deletions ggml/src/ggml-backend.cpp
@@ -1347,7 +1347,8 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
return false;
@@ -1546,14 +1547,31 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
sched->is_alloc = false;
}

void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

ggml_backend_sched_split_graph(sched, measure_graph);

ggml_backend_sched_synchronize(sched);

GGML_ASSERT(ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ true));
for (int ib = 0; ib < sched->n_backends; ib++) {
sizes[ib] = ggml_gallocr_get_max_size(sched->galloc, ggml_backend_get_device(sched->backends[ib]));
}

ggml_backend_sched_reset(sched);
}

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

ggml_backend_sched_split_graph(sched, measure_graph);

ggml_backend_sched_synchronize(sched);

if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false)) {
return false;
}

7 changes: 7 additions & 0 deletions include/llama.h
@@ -414,6 +414,13 @@ extern "C" {
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

// returns success
LLAMA_API bool llama_expected_memory_use(
const char * path_model,
struct llama_model_params mparams,
struct llama_context_params cparams,
size_t * nbytes_expect);

// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
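A hedged sketch of how a caller could combine the new llama_expected_memory_use() with the existing per-device memory query (not part of this diff). The expectation array is assumed to hold one entry per backend device, matching how common_fit_to_free_memory() indexes it above; the model path and the margin are placeholders.

#include <vector>
#include "ggml-backend.h"
#include "llama.h"

// Sketch: check whether a given n_gpu_layers value is expected to fit while
// leaving at least `margin` bytes free on every device.
static bool ngl_fits(const char * path_model, int ngl, size_t margin) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_gpu_layers = ngl;

    const size_t n_dev = ggml_backend_dev_count();
    std::vector<size_t> expect(n_dev);
    if (!llama_expected_memory_use(path_model, mparams, cparams, expect.data())) {
        return false; // prediction failed, treat as not fitting
    }
    for (size_t i = 0; i < n_dev; i++) {
        size_t free_mem, total_mem;
        ggml_backend_dev_memory(ggml_backend_dev_get(i), &free_mem, &total_mem);
        if (expect[i] + margin > free_mem) {
            return false;
        }
    }
    return true;
}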