Commit 7aea1d7

clean up unused llava functions, fix qwen3vl loading
1 parent 008405e

3 files changed: 2 additions & 333 deletions

gpttype_adapter.cpp

Lines changed: 2 additions & 2 deletions
@@ -1811,7 +1811,7 @@ static void load_grammar(const std::string & gammarstr)
 }

 static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past) {
-    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+    int n_embd = llama_model_n_embd_inp(llama_get_model(ctx_llama));

     for (int i = 0; i < num_img_tokens; i += n_batch) {
         int n_eval = num_img_tokens - i;
@@ -2489,7 +2489,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
         return ModelLoadResult::FAIL;
     }
-    const int n_embd_llm = llama_n_embd(llamamodel);
+    const int n_embd_llm = llama_model_n_embd_inp(llamamodel);
     int n_embd_clip_a = -1;
     int n_embd_clip_v = -1;
     if (clp_ctx_v)
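
Why both call sites change: for models such as Qwen3-VL, the embedding width the model accepts at its input can differ from the hidden size that llama_n_embd reports, so sizing image-embedding batches (and the mmproj check below) with llama_n_embd mis-strides the buffer, which is presumably what broke Qwen3-VL loading. A minimal sketch of the batching pattern kcpp_eval_image relies on, assuming the llama.cpp C API (the helper name and error handling here are illustrative, not the commit's code):

#include <algorithm>
#include <vector>
#include "llama.h"

// Feed num_img_tokens image embeddings to the model in n_batch-sized chunks.
static bool eval_image_embd_chunks(llama_context * ctx, float * img_embd,
                                   int num_img_tokens, int n_batch, int * n_past) {
    // Query the input embedding width, not the hidden size -- the point of the fix.
    const int n_embd = llama_model_n_embd_inp(llama_get_model(ctx));
    for (int i = 0; i < num_img_tokens; i += n_batch) {
        const int n_eval = std::min(n_batch, num_img_tokens - i);
        std::vector<llama_pos>      pos(n_eval);
        std::vector<int32_t>        n_seq(n_eval, 1);
        llama_seq_id                seq0 = 0;
        std::vector<llama_seq_id *> seq(n_eval, &seq0);
        std::vector<int8_t>         logits(n_eval, 0);
        for (int j = 0; j < n_eval; ++j) { pos[j] = *n_past + j; }
        llama_batch batch = {
            /*n_tokens =*/ n_eval,
            /*token    =*/ nullptr,                        // embeddings-only batch
            /*embd     =*/ img_embd + (size_t) i * n_embd, // stride uses the input width
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq.data(),
            /*seq_id   =*/ seq.data(),
            /*logits   =*/ logits.data(),
        };
        if (llama_decode(ctx, batch) != 0) { return false; }
        *n_past += n_eval;
    }
    return true;
}

This is the same token=nullptr/embd-set batch shape that the llava_embd_batch helper below builds, so llama_decode consumes raw embeddings instead of token ids.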

tools/mtmd/llava.cpp

Lines changed: 0 additions & 318 deletions
@@ -15,219 +15,13 @@
 #include <vector>
 #include <memory>

-struct clip_image_grid_shape {
-    int first;
-    int second;
-};

 // convenience cpp wrapper
 struct clip_image_f32_batch_deleter {
     void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
 };
 typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;

-/**
- * Selects the best resolution from a list of possible resolutions based on the original size.
- *
- * @param original_size The original size of the image in the format (width, height).
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
- * @return The best fit resolution in the format (width, height).
- */
-static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
-    int original_width = original_size.first;
-    int original_height = original_size.second;
-
-    std::pair<int, int> best_fit;
-    int max_effective_resolution = 0;
-    int min_wasted_resolution = std::numeric_limits<int>::max();
-
-    for (const auto& resolution : possible_resolutions) {
-        int width = resolution.first;
-        int height = resolution.second;
-        float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
-        int downscaled_width = static_cast<int>(original_width * scale);
-        int downscaled_height = static_cast<int>(original_height * scale);
-        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
-        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
-            max_effective_resolution = effective_resolution;
-            min_wasted_resolution = wasted_resolution;
-            best_fit = resolution;
-        }
-    }
-
-    return best_fit;
-}
-
-/**
- * @brief Get the anyres image grid shape object
- *
- * @param image_size
- * @param grid_pinpoints
- * @param image_patch_size
- * @return <int, int>
- */
-static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
-    /**
-        Conversion from gguf flat array to vector:
-        std::vector<std::pair<int, int>> possible_resolutions;
-        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) {
-            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
-        }
-     */
-    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
-    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
-}
-
-// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
-    struct {
-        struct ggml_context * ctx;
-    } model;
-
-    const int32_t image_size = clip_get_image_size(ctx_clip);
-    const int32_t patch_size = clip_get_patch_size(ctx_clip);
-
-    int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
-
-    int num_patches_width = grid_shape.first;   // grid 1-4
-    int num_patches_height = grid_shape.second; // grid 1-4
-
-    const size_t num_images = num_patches_width * num_patches_height + 1;
-
-    // TODO: size calculation is not calculated - it's only tens of MB
-    size_t ctx_size = 0;
-
-    {
-        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
-        ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
-    }
-
-    struct ggml_init_params params {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
-    };
-
-    // Python reference code for full unpad:
-    /*
-        base_image_feature = image_feature[0]
-        image_feature = image_feature[1:]
-        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-        image_feature = unpad_image(image_feature, image_sizes[image_idx])
-        image_feature = torch.cat((
-            image_feature,
-            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
-        ), dim=-1)
-        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-    */
-    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
-    // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
-    // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
-    // Once all images are processed to prepended the base_image_features without any changes.
-
-    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
-    /*
-        image_feature = image_feature.view(2, 2, 24, 24, 4096)
-        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-        image_feature = image_feature.view(2, 24, 2, 24, 4096)
-        image_feature = image_feature.flatten(0, 3)
-
-        // Reshape to 4D tensor by merging the last two dimensions
-        image_feature = image_feature.view(2, 2, 24, 24*4096)
-        image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
-        image_feature = image_feature.view(-1, 4096)
-    */
-
-    model.ctx = ggml_init(params);
-
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
-    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
-    // fill it with the image embeddings, ignoring the base
-    for (size_t i = 1; i < num_images; i++) {
-        size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
-        memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
-    }
-
-    struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
-    size_t size_ele = ggml_type_size(GGML_TYPE_F32);
-
-    struct ggml_tensor * image_features_patchview = ggml_view_4d(model.ctx, image_features,
-                                                                 num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
-                                                                 num_patches_per_side,
-                                                                 num_patches_width,
-                                                                 num_patches_height,
-                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
-                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
-                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
-    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
-    struct ggml_tensor * permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
-    /**
-        At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
-        image_feature = torch.cat((
-            image_feature,
-            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
-        ), dim=-1)
-     */
-
-    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
-    struct ggml_tensor * flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
-    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
-    ggml_build_forward_expand(gf, flatten);
-
-    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
-    GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
-    ggml_backend_graph_compute(backend.get(), gf);
-
-    struct ggml_tensor * result = ggml_graph_node(gf, -1);
-
-    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
-    // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1] + clip_n_output_tokens(ctx_clip, img_input));
-
-    // Debug: Test single segments
-    // Current findings: sending base image, sending a segment embedding all works similar to python
-    // However, permuted embeddings do not work yet (stride issue?)
-    // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
-    // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
-    // *n_img_pos_out=576;
-
-    ggml_free(model.ctx);
-    return true;
-}
-
-static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
-    int width = image->nx;
-    int height = image->ny;
-    int num_patches = (height / patch_size) * (width / patch_size);
-    clip_image_f32 * patch = clip_image_f32_init();
-    patch->nx = patch_size * num_patches;
-    patch->ny = patch_size;
-    patch->buf.resize(3 * patch->nx * patch->ny);
-
-    int patch_index = 0;
-
-    for (int i = 0; i < height; i += patch_size) {
-        for (int j = 0; j < width; j += patch_size) {
-            for (int pi = 0; pi < patch_size; ++pi) {
-                for (int pj = 0; pj < patch_size; ++pj) {
-                    int input_index = ((i + pi) * width + (j + pj)) * 3;
-                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
-                    patch->buf[output_index] = image->buf[input_index];
-                    patch->buf[output_index+1] = image->buf[input_index+1];
-                    patch->buf[output_index+2] = image->buf[input_index+2];
-                }
-            }
-            patch_index++;
-        }
-    }
-    return patch;
-}

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos) {

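For reference, the removed helpers implement LLaVA's "anyres" tiling: select_best_resolution keeps the grid pinpoint that preserves the most source pixels after an aspect-preserving downscale, breaking ties by the least wasted padding, and get_anyres_image_grid_shape divides the winner by the clip input size to get the tile grid that clip_llava_handle_patches then permutes. A worked example with assumed values (336-pixel clip input, an 800x600 source, two candidate pinpoints):

// candidate (672,672):  scale = min(672/800, 672/600) = 0.84 -> downscaled 672x504
//                       effective = min(672*504, 800*600) = 338688; wasted = 672*672 - 338688 = 112896
// candidate (336,1344): scale = min(336/800, 1344/600) = 0.42 -> downscaled 336x252
//                       effective = min(336*252, 800*600) = 84672;  wasted = 336*1344 - 84672 = 366912
// -> best fit (672,672); grid shape = (672/336, 672/336) = (2,2), i.e. 4 sub-tiles + 1 base image
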
@@ -255,17 +49,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct cl
     return true;
 }

-bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
-    // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
-    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
-    if (n_image_embd != n_llama_embd) {
-        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
-        return false;
-    }
-    return true;
-}
-
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
     // Granite vision uses up to 10 patches + base patch
     int num_max_patches = 11;
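
The retained llava_image_embed_make_with_clip_img sizes its output buffer for that worst case up front; a hedged sketch of the allocation implied by the comment (the exact expression in this tree may differ):

// Granite vision uses up to 10 patches + base patch, so reserve 11 tiles' worth
// of projector output before encoding; each tile costs clip_embd_nbytes bytes.
const int num_max_patches = 11;
float * image_embd = (float *) malloc(clip_embd_nbytes(ctx_clip) * num_max_patches);
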
@@ -348,107 +131,6 @@ struct llava_embd_batch {
     }
 };

-bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
-
-    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
-        int n_eval = image_embed->n_image_pos - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
-        if (llama_decode(ctx_llama, llava_batch.batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-    clip_image_u8 * img = clip_image_u8_init();
-    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img, 2048)) {
-        clip_image_u8_free(img);
-        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
-        return NULL;
-    }
-
-    float* image_embed = NULL;
-    int n_image_pos = 0;
-    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
-    if (!image_embed_result) {
-        clip_image_u8_free(img);
-        LOG_ERR("%s: couldn't embed the image\n", __func__);
-        return NULL;
-    }
-
-    clip_image_u8_free(img);
-    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
-    result->embed = image_embed;
-    result->n_image_pos = n_image_pos;
-    return result;
-}
-
-static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
-    auto file = fopen(path, "rb");
-    if (file == NULL) {
-        LOG_ERR("%s: can't read file %s\n", __func__, path);
-        return false;
-    }
-
-    fseek(file, 0, SEEK_END);
-    auto fileSize = ftell(file);
-    fseek(file, 0, SEEK_SET);
-
-    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
-    if (buffer == NULL) {
-        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
-        perror("Memory allocation error");
-        fclose(file);
-        return false;
-    }
-    errno = 0;
-    size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
-    if (ferror(file)) {
-        LOG_ERR("read error: %s", strerror(errno));
-        free(buffer);
-        fclose(file);
-        return false;
-    }
-    if (ret != (size_t) fileSize) {
-        LOG_ERR("unexpectedly reached end of file");
-        free(buffer);
-        fclose(file);
-        return false;
-    }
-    fclose(file); // Close the file
-
-    *bytesOut = buffer;
-    *sizeOut = fileSize;
-    return true;
-}
-
-struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
-    unsigned char* image_bytes;
-    long image_bytes_length;
-    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
-    if (!loaded) {
-        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
-        return NULL;
-    }
-
-    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
-    free(image_bytes);
-
-    return embed;
-}
-
-void llava_image_embed_free(struct llava_image_embed * embed) {
-    free(embed->embed);
-    free(embed);
-}

 //kcpp helper function
 bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out)

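With the byte/file loaders and llava_eval_image_embed gone, image handling in this tree flows through the retained llava_image_embed_make_with_clip_img plus the adapter's kcpp_eval_image. A minimal sketch of that surviving path (clp_ctx, llama_ctx, n_threads, n_batch, and n_past are assumed to exist in the caller; error handling trimmed):

// Decode raw bytes to RGB, embed through clip + projector, then feed the LLM.
clip_image_u8 * img = clip_image_u8_init();
float * embd  = nullptr;
int     n_pos = 0;
if (clip_image_load_from_bytes(image_bytes, image_bytes_length, img, 2048) &&
    llava_image_embed_make_with_clip_img(clp_ctx, n_threads, img, &embd, &n_pos)) {
    kcpp_eval_image(llama_ctx, embd, n_pos, n_batch, &n_past); // see gpttype_adapter.cpp above
}
clip_image_u8_free(img);
free(embd); // llava_image_embed_make_with_clip_img mallocs the buffer
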
tools/mtmd/llava.h

Lines changed: 0 additions & 13 deletions
@@ -30,21 +30,8 @@ namespace whisper_preprocessor {
 struct whisper_mel;
 }

-/** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
-
 LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

-/** build an image embed from image file bytes */
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
-/** build an image embed from a path to an image filename */
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-/** free an embedding made with llava_image_embed_make_* */
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
-
-/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
-LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

 LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);
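
Nothing replaces the llava_validate_embed_size prototype here because gpttype_adapter.cpp performs the equivalent sanity check inline, now against the input embedding width (the n_embd_llm hunk above). A hedged sketch of that check, reusing the clip query from the deleted definition (the surrounding control flow is assumed):

// After loading the mmproj, make sure the projector's output width matches
// what the model expects at its input; a mismatch means the wrong mmproj file.
const int n_embd_llm    = llama_model_n_embd_inp(llamamodel);
const int n_embd_clip_v = clp_ctx_v ? clip_n_mmproj_embd(clp_ctx_v) : -1;
if (n_embd_clip_v != -1 && n_embd_clip_v != n_embd_llm) {
    fprintf(stderr, "mmproj embedding dim (%d) != model input embedding dim (%d). Make sure you use the correct mmproj file!\n",
            n_embd_clip_v, n_embd_llm);
}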
