
llama: automatically set runtime parameters such as --n-gpu-layers to fit VRAM #14067


Draft · wants to merge 1 commit into master
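In outline, the patch adds a common_fit_to_free_memory() helper that queries the free memory of every backend device and then searches over n_gpu_layers until the usage predicted by the new llama_expected_memory_use() API leaves at least a fixed 1 GiB margin free on each device. The snippet below is a condensed, illustrative sketch of that search only, not code from this diff; fit_n_gpu_layers and fits are made-up names standing in for the real helper and its memory check.

#include <functional>

// Illustrative sketch: find the largest n_gpu_layers for which `fits` returns
// true, assuming `fits` is monotone (once it fails, larger values keep failing).
// In this PR, "fits" corresponds to: the memory predicted by
// llama_expected_memory_use() stays at least 1 GiB below the free memory
// reported by ggml_backend_dev_memory() on every device.
static int fit_n_gpu_layers(const std::function<bool(int)> & fits, int ngl_max) {
    if (!fits(0)) {
        return 0;       // even offloading nothing leaves too little free memory
    }
    if (fits(ngl_max)) {
        return ngl_max; // everything can be offloaded
    }
    int lo = 0;         // invariant: fits(lo) is true
    int hi = ngl_max;   // invariant: fits(hi) is false
    while (hi - lo > 1) {
        const int mid = (lo + hi) / 2;
        if (fits(mid)) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    return lo;
}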
71 changes: 69 additions & 2 deletions common/common.cpp
@@ -885,9 +885,78 @@ std::string fs_get_cache_file(const std::string & filename) {
// Model utils
//

static void common_fit_to_free_memory(
const std::string & path_model, llama_model_params & mparams, llama_context_params & cparams, const size_t margin) {

std::vector<ggml_backend_dev_t> devices(ggml_backend_dev_count());
for (size_t i = 0; i < devices.size(); i++) {
devices[i] = ggml_backend_dev_get(i);
}

std::vector<size_t> memory_total(devices.size());
std::vector<size_t> memory_free(devices.size());
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_memory(devices[i], memory_free.data() + i, memory_total.data() + i);
}

auto get_min_margin = [path_model, memory_free](const llama_model_params & mparams_test, const llama_context_params & cparams_test) {
std::vector<size_t> memory_expect(memory_free.size());
GGML_ASSERT(llama_expected_memory_use(path_model.c_str(), mparams_test, cparams_test, memory_expect.data()));

int64_t min_margin = INT64_MAX;
for (size_t i = 0; i < memory_free.size(); i++) {
min_margin = std::min(min_margin, int64_t(memory_free[i]) - int64_t(memory_expect[i]));
}
return min_margin;
};
auto test_ngl = [mparams, cparams, get_min_margin](const int ngl) {
llama_model_params mparams_test = mparams;
mparams_test.n_gpu_layers = ngl;
return get_min_margin(mparams_test, cparams);
};

int ngl_low = 0;
int64_t margin_low = test_ngl(ngl_low);
if (margin_low < int64_t(margin)) {
mparams.n_gpu_layers = ngl_low;
return;
}

int ngl_high = 128; // FIXME
int64_t margin_high = test_ngl(ngl_high);
if (margin_high >= int64_t(margin)) {
mparams.n_gpu_layers = ngl_high;
return;
}

// TODO bisection is inefficient, better to interpolate if the max ngl value is known
while (ngl_high - ngl_low > 1) {
const int ngl_test = (ngl_high + ngl_low) / 2;
const int64_t margin_test = test_ngl(ngl_test);

if (margin_test < int64_t(margin)) {
ngl_high = ngl_test;
margin_high = margin_test;
} else {
ngl_low = ngl_test;
margin_low = margin_test;
}
}

if (margin_high >= int64_t(margin)) {
mparams.n_gpu_layers = ngl_high;
} else {
mparams.n_gpu_layers = ngl_low;
}
}

struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);

constexpr size_t margin = 1024*1024*1024;
common_fit_to_free_memory(params.model.path, mparams, cparams, margin);

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
@@ -925,8 +994,6 @@ struct common_init_result common_init_from_params(common_params & params) {
}
}

auto cparams = common_context_params_to_llama(params);

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
6 changes: 5 additions & 1 deletion ggml/include/ggml-alloc.h
@@ -9,6 +9,7 @@ extern "C" {
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;

// Tensor allocator
struct ggml_tallocr {
@@ -58,16 +59,19 @@ GGML_API bool ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
const int * leaf_buffer_ids);
const int * leaf_buffer_ids,
bool dry_run);

// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

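As a hedged usage sketch (not part of this diff): with the new dry_run parameter and the two sizing helpers declared above, memory requirements can be measured without allocating backend memory. The gallocr, graph, context and device below are assumed to have been set up by the caller in the usual way.

#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: measure the compute and tensor buffer sizes for one device without
// allocating backend memory. `galloc` was created for the device's buffer type,
// `gf` is the graph to measure, and `ctx` is a no_alloc ggml_context holding
// the tensors destined for that device.
static void measure_device_usage(ggml_gallocr_t galloc, struct ggml_cgraph * gf,
                                 struct ggml_context * ctx, ggml_backend_dev_t dev,
                                 size_t * compute_bytes, size_t * weight_bytes) {
    ggml_gallocr_reserve_n(galloc, gf, /*node_buffer_ids =*/ NULL,
                           /*leaf_buffer_ids =*/ NULL, /*dry_run =*/ true);
    *compute_bytes = ggml_gallocr_get_max_size(galloc, dev);

    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);
    *weight_bytes = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
}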
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -293,6 +293,7 @@ extern "C" {
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); // result per backend is written to sizes
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
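A hedged sketch of how the new ggml_backend_sched_reserve_size() might be consumed (not part of this diff): one size is written per backend, in the scheduler's backend order, so it can be paired with ggml_backend_sched_get_backend().

#include <cstdio>
#include <vector>
#include "ggml-backend.h"

// Sketch: report the expected compute-buffer size per backend for a measure
// graph without allocating anything. `sched` and `gf` are assumed to be set up
// exactly as they would be for ggml_backend_sched_reserve().
static void print_expected_compute_sizes(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    const int n_backends = ggml_backend_sched_get_n_backends(sched);
    std::vector<size_t> sizes(n_backends);
    ggml_backend_sched_reserve_size(sched, gf, sizes.data());
    for (int i = 0; i < n_backends; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        printf("%s: %zu bytes\n", ggml_backend_name(backend), sizes[i]);
    }
}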
60 changes: 50 additions & 10 deletions ggml/src/ggml-alloc.c
@@ -150,6 +150,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif

// returns the offset for the allocation
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);

@@ -472,7 +473,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
return t->data != NULL // tensor data already set externally
|| t->buffer // tensor on external buffer (but may not yet be allocated)
|| ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
}

static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
@@ -670,7 +673,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph,
const int * node_buffer_ids, const int * leaf_buffer_ids, bool dry_run) {
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
@@ -768,7 +772,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
#endif

ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], dry_run ? 0 : new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
@@ -781,7 +785,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL, /*dry_run =*/ false);
}

static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
@@ -934,6 +938,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}

size_t ggml_gallocr_get_max_size(ggml_gallocr_t galloc, ggml_backend_dev_t dev) {
for (int i = 0; i < galloc->n_buffers; i++) {
if (ggml_backend_buft_get_device(galloc->bufts[i]) == dev) {
return ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
}
}
return 0;
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
@@ -984,14 +997,16 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
return true;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool dry_run) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);

ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
*nbytes_total = 0;

size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1003,10 +1018,13 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
if (!dry_run) {
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
first = t;
*nbytes_total += cur_buf_size;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
@@ -1015,15 +1033,23 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
*nbytes_total += cur_buf_size;
if (!dry_run) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
}

if (dry_run) {
return NULL;
}

if (n_buffers == 0) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
GGML_ASSERT(!buffers);
return NULL;
}

@@ -1033,10 +1059,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
if (buffers) {
free(buffers); // can be NULL if dry_run or context is empty
}
return buffer;
}

size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ true);
GGML_ASSERT(!buf);
return nbytes_total;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
size_t nbytes_total = 0;
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*dry_run =*/ false);
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
22 changes: 20 additions & 2 deletions ggml/src/ggml-backend.cpp
@@ -1347,7 +1347,8 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
return false;
@@ -1546,14 +1547,31 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
sched->is_alloc = false;
}

void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

ggml_backend_sched_split_graph(sched, measure_graph);

ggml_backend_sched_synchronize(sched);

GGML_ASSERT(ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ true));
for (int ib = 0; ib < sched->n_backends; ib++) {
sizes[ib] = ggml_gallocr_get_max_size(sched->galloc, ggml_backend_get_device(sched->backends[ib]));
}

ggml_backend_sched_reset(sched);
}

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

ggml_backend_sched_split_graph(sched, measure_graph);

ggml_backend_sched_synchronize(sched);

if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph,
sched->node_backend_ids, sched->leaf_backend_ids, /*dry_run =*/ false)) {
return false;
}

7 changes: 7 additions & 0 deletions include/llama.h
@@ -414,6 +414,13 @@ extern "C" {
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

// returns success
LLAMA_API bool llama_expected_memory_use(
const char * path_model,
struct llama_model_params mparams,
struct llama_context_params cparams,
size_t * nbytes_expect);

// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
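A hedged sketch of how a caller could combine the new llama_expected_memory_use() with the existing per-device memory query (not part of this diff). The expectation array is assumed to hold one entry per backend device, matching how common_fit_to_free_memory() indexes it above; the model path and the margin are placeholders.

#include <vector>
#include "ggml-backend.h"
#include "llama.h"

// Sketch: check whether a given n_gpu_layers value is expected to fit while
// leaving at least `margin` bytes free on every device.
static bool ngl_fits(const char * path_model, int ngl, size_t margin) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    mparams.n_gpu_layers = ngl;

    const size_t n_dev = ggml_backend_dev_count();
    std::vector<size_t> expect(n_dev);
    if (!llama_expected_memory_use(path_model, mparams, cparams, expect.data())) {
        return false; // prediction failed, treat as not fitting
    }
    for (size_t i = 0; i < n_dev; i++) {
        size_t free_mem, total_mem;
        ggml_backend_dev_memory(ggml_backend_dev_get(i), &free_mem, &total_mem);
        if (expect[i] + margin > free_mem) {
            return false;
        }
    }
    return true;
}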