@@ -108,6 +108,13 @@ The llama_sampler_i interface has been extended with 4 new methods in the API,
 and they are currently all named with a `_ggml` suffix to indicate that they
 are for GPU sampling:
 ```c++
+    void (*init_ggml)(struct llama_sampler * smpl,
+                      ggml_backend_buffer_type_t buft);
+
+    void (*set_input_ggml)( struct llama_sampler * smpl,
+                            ggml_context * ctx,
+                            ggml_cgraph * gf);
+
     void (*apply_ggml)( struct llama_sampler * smpl,
                         ggml_context * ctx,
                         ggml_cgraph * gf,
@@ -118,21 +125,68 @@ are for GPU sampling:
                         ggml_cgraph * gf,
                         struct ggml_tensor * selected_token);
 
-    void (*set_input_ggml)( struct llama_sampler * smpl,
-                            ggml_context * ctx,
-                            ggml_cgraph * gf);
+```
+The _init_ggml_ function allows GPU samplers to create any input tensors that
+they need. The passed in ggml_backend_buffer_type_t should be used when
+creating these tensors so that they are allocated with the same backend buffer
+type as the output logits. This avoids splits in the computation graph that
+would require data transfers between different backends.
+For example:
+``` c++
+struct llama_sampler_gpu_dist_ctx {
+    const uint32_t seed;
+    uint32_t       seed_cur;
+    std::mt19937   rng;
+
+    struct ggml_tensor  * uniform;
+    struct ggml_context * ctx;
+    ggml_backend_buffer_t buffer;
+};
+
+static void llama_sampler_gpu_dist_init_ggml(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    sctx->ctx = ggml_init(params);
+
+    // Create the uniform random scalar input tensor. This will be set by
+    // llama_sampler_gpu_dist_set_input_ggml after this graph is built.
+    sctx->uniform = ggml_new_tensor_1d(sctx->ctx, GGML_TYPE_F32, 1);
+    ggml_set_name(sctx->uniform, "uniform");
+    ggml_set_input(sctx->uniform);
+    ggml_set_output(sctx->uniform);
+
+    // Allocate all tensors from our context to the backend
+    sctx->buffer = ggml_backend_alloc_ctx_tensors_from_buft(sctx->ctx, buft);
+}
+```
+
+The _set_input_ggml_ function is called after the computation graph has been
+scheduled but before it is computed. This allows the GPU sampler to set the
+data of any input tensors it created in _init_ggml_.
+```c++
+static void llama_sampler_gpu_dist_set_input_ggml(
+        struct llama_sampler * smpl, ggml_context * ctx, ggml_cgraph * gf) {
+    // ctx and gf are not needed here, the signature matches llama_sampler_i
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(gf);
+    auto * sctx = (llama_sampler_gpu_dist_ctx *) smpl->ctx;
+    GGML_ASSERT(sctx->uniform != nullptr);
 
-    void (*set_backend_context)( struct llama_sampler * smpl,
-                                 ggml_backend_sched_t sched,
-                                 ggml_backend_t backend);
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+    const float rnd = dist(sctx->rng);
+    ggml_backend_tensor_set(sctx->uniform, &rnd, 0, sizeof(float));
+}
 ```
-set_backenck_context function is use to enable the GPU sampler to know which
-backend the tensors that it creates/uses should be created on. This is important
-so that we avoid splits in the computation graph that would require data transfer
-between different backends.
 
-apply_ggml is where the GPU sampler adds its operations to the graphs. For
-example the greedy sampler will select the token with the highest probability:
+The _apply_ggml_ function is where a GPU sampler adds its operations to the
+computation graph. When the graph is built, each configured sampler's
+_apply_ggml_ function is called, which allows it to add operations/nodes to
+the graph. For example, the greedy sampler shown below selects the token with
+the highest probability.
+
+The _accept_ggml_ function allows GPU samplers to update their tensor state
+once a token has been selected, if needed.
+
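+As a rough sketch (not code from the PR), an _accept_ggml_ implementation for
+a sampler that tracks the most recently selected token could look like the
+following, assuming a hypothetical sampler context with a `last_token` state
+tensor created in _init_ggml_:
+```c++
+static void llama_sampler_gpu_example_accept_ggml(
+        struct llama_sampler * smpl,
+        ggml_context * ctx,
+        ggml_cgraph * gf,
+        struct ggml_tensor * selected_token) {
+    // Hypothetical sampler context holding a `last_token` state tensor.
+    auto * sctx = (llama_sampler_gpu_example_ctx *) smpl->ctx;
+
+    // Add a copy of the selected token into the sampler's state tensor so
+    // that the next apply_ggml call can read it (e.g. to penalize repeats).
+    struct ggml_tensor * cpy = ggml_cpy(ctx, selected_token, sctx->last_token);
+    ggml_build_forward_expand(gf, cpy);
+}
+```
+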
 ``` c++
 static void llama_sampler_gpu_greedy_apply_ggml (
         struct llama_sampler * smpl,
@@ -168,6 +222,35 @@ and it uncovered some issues that the tests missed.
 The pull request can be found here:
 https://github.com/ggml-org/llama.cpp/pull/17004
 
+
+#### Setting/unsetting a GPU sampler
+Currently, samplers are configured for a specific sequence id, which happens
+at the same time that the context is created. They are stored in a map keyed
+by sequence id:
+```c++
+std::unordered_map<llama_seq_id, llama_sampler*> samplers;
+```
+In the llama_context constructor we have the following:
+``` c++
+llama_context::llama_context(
+        const llama_model & model,
+              llama_context_params params) :
+    model(model),
+    balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    ...
+
+    // GPU samplers
+    if (params.samplers != nullptr && params.n_samplers > 0) {
+        samplers.reserve(params.n_samplers);
+
+        for (size_t i = 0; i < params.n_samplers; ++i) {
+            const auto & config = params.samplers[i];
+            samplers[config.seq_id] = config.sampler;
+        }
+    }
+```
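+
+From the API side, configuring a sampler for a sequence at context creation
+could then look something like the following sketch. The config struct name,
+its exact layout, and the sampler init function are assumptions based on the
+fields used above, not the actual API:
+```c++
+// Hypothetical config type with the seq_id/sampler fields that are accessed
+// in the constructor above; the GPU greedy sampler init function name is
+// also assumed.
+llama_sampler_seq_config configs[] = {
+    { /*.seq_id =*/ 0, /*.sampler =*/ llama_sampler_gpu_init_greedy() },
+};
+
+llama_context_params cparams = llama_context_default_params();
+cparams.samplers   = configs;
+cparams.n_samplers = 1;
+
+// Assuming `model` was loaded earlier with llama_model_load_from_file.
+llama_context * ctx = llama_init_from_model(model, cparams);
+```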
+Now, we might want to unset or change the GPU sampler for a specific sequence.
+Unsetting would just be clearing the entry for that sequence id, so let's
+start with that functionality.
+
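+A sketch of what this could look like (the function name and its placement on
+llama_context are assumptions, not from the PR), where passing nullptr clears
+the sampler for the given sequence id:
+```c++
+// Hypothetical helper on llama_context, not part of the actual PR.
+void llama_context::set_gpu_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
+    if (sampler == nullptr) {
+        // Unset: remove any GPU sampler configured for this sequence.
+        samplers.erase(seq_id);
+        return;
+    }
+    samplers[seq_id] = sampler;
+}
+```
+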
 ----
 
 The sections below contain some notes taken during the initial design and