From 800d8e31b827a6239143a2434e90722e5a7c38c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:13 +0200 Subject: [PATCH 01/16] Instruct-p2p support --- model.cpp | 4 ++++ model.h | 3 ++- stable-diffusion.cpp | 16 ++++++++++++++++ unet.hpp | 2 ++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/model.cpp b/model.cpp index 85c959057..5106e69cb 100644 --- a/model.cpp +++ b/model.cpp @@ -1674,6 +1674,7 @@ SDVersion ModelLoader::get_sd_version() { } } bool is_inpaint = input_block_weight.ne[2] == 9; + bool is_ip2p = input_block_weight.ne[2] == 8; if (is_xl) { if (is_inpaint) { return VERSION_SDXL_INPAINT; @@ -1693,6 +1694,9 @@ SDVersion ModelLoader::get_sd_version() { if (is_inpaint) { return VERSION_SD1_INPAINT; } + if(is_ip2p) { + return VERSION_INSTRUCT_PIX2PIX; + } return VERSION_SD1; } else if (token_embedding_weight.ne[0] == 1024) { if (is_inpaint) { diff --git a/model.h b/model.h index 82885dd96..8a5e0e27a 100644 --- a/model.h +++ b/model.h @@ -21,6 +21,7 @@ enum SDVersion { VERSION_SD1, VERSION_SD1_INPAINT, + VERSION_INSTRUCT_PIX2PIX, VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SDXL, @@ -47,7 +48,7 @@ static inline bool sd_version_is_sd3(SDVersion version) { } static inline bool sd_version_is_sd1(SDVersion version) { - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_INSTRUCT_PIX2PIX) { return true; } return false; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9c8265727..2111d3e53 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -27,6 +27,7 @@ const char* model_version_to_str[] = { "SD 1.x", "SD 1.x Inpaint", + "Instruct-Pix2Pix", "SD 2.x", "SD 2.x Inpaint", "SDXL", @@ -1476,9 +1477,16 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } cond.c_concat = masked_image; uncond.c_concat = masked_image; + // noise_mask = masked_image; + } else if (sd_ctx->sd->version == 
VERSION_INSTRUCT_PIX2PIX) { + cond.c_concat = masked_image; + auto empty_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); + ggml_set_f32(empty_img, 0); + uncond.c_concat = empty_img; } else { noise_mask = masked_image; } + for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); int64_t cur_seed = seed + b; @@ -1801,6 +1809,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } } + } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { + // Not actually masked, we're just highjacking the masked_image variable since it will be used the same way + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + masked_image = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + masked_image = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + } } else { // LOG_WARN("Inpainting with a base model is not great"); masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); diff --git a/unet.hpp b/unet.hpp index 31b7fe986..fdbd82453 100644 --- a/unet.hpp +++ b/unet.hpp @@ -207,6 +207,8 @@ class UnetModelBlock : public GGMLBlock { } if (sd_version_is_inpaint(version)) { in_channels = 9; + } else if (version == VERSION_INSTRUCT_PIX2PIX) { + in_channels = 8; } // dims is always 2 From 70625068217416825f1a039c41d2f518ac910f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:18 +0200 Subject: [PATCH 02/16] support 2 conditionings cfg --- stable-diffusion.cpp | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 2111d3e53..03e8035e8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -841,6 +841,11 @@ class StableDiffusionGGML { float skip_layer_start = 0.01, float skip_layer_end = 0.2, ggml_tensor* 
noise_mask = nullptr) { + + // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) + + float img_cfg_scale = guidance; + LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -863,12 +868,15 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale; + has_unconditioned = has_unconditioned || has_img_guidance; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; + struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor* out_skip = NULL; + struct ggml_tensor* out_img_cond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); @@ -881,6 +889,9 @@ class StableDiffusionGGML { LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); } } + if (has_img_guidance) { + out_img_cond = ggml_dup_tensor(work_ctx, x); + } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { @@ -965,6 +976,23 @@ class StableDiffusionGGML { negative_data = (float*)out_uncond->data; } + float* img_cond_data = NULL; + if (has_img_guidance) { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + cond.c_concat, + uncond.c_vector, + guidance_tensor, + ref_latents, + -1, + controls, + control_strength, + &out_img_cond); + img_cond_data = (float*)out_img_cond->data; + } + int step_count = sigmas.size(); bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && 
step < (int)(skip_layer_end * step_count); float* skip_layer_data = NULL; @@ -1000,7 +1028,11 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + if(has_img_guidance){ + latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); + } else { + latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + } } } if (is_skiplayer_step) { @@ -1408,7 +1440,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (cfg_scale != 1.0) { + if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!=guidance) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; From 0cf0b80fb4d88aff167be629d7f6be6761084150 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:22 +0200 Subject: [PATCH 03/16] Do not re-encode the exact same image twice --- stable-diffusion.cpp | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 03e8035e8..1337df079 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -841,7 +841,6 @@ class StableDiffusionGGML { float skip_layer_start = 0.01, float skip_layer_end = 0.2, ggml_tensor* noise_mask = nullptr) { - // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) float img_cfg_scale = guidance; @@ -869,7 +868,7 @@ class StableDiffusionGGML { bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; bool 
has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale; - has_unconditioned = has_unconditioned || has_img_guidance; + has_unconditioned = has_unconditioned || has_img_guidance; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper @@ -1028,7 +1027,7 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - if(has_img_guidance){ + if (has_img_guidance) { latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); } else { latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); @@ -1440,7 +1439,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!=guidance) { + if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != guidance) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; @@ -1796,6 +1795,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_to_tensor(init_image.data, init_img); + ggml_tensor* init_latent = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + } + ggml_tensor* masked_image; if (sd_version_is_inpaint(sd_ctx->sd->version)) { @@ -1843,12 +1850,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { // Not actually masked, we're just highjacking the masked_image 
variable since it will be used the same way - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - masked_image = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - masked_image = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } + masked_image = init_latent; } else { // LOG_WARN("Inpainting with a base model is not great"); masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); @@ -1862,14 +1864,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } - ggml_tensor* init_latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } - print_ggml_tensor(init_latent, true); size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); From 2373c08f7ba72e8251a48bbd689b6f572c004483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:25 +0200 Subject: [PATCH 04/16] fixes for 2-cfg --- model.h | 4 ++++ stable-diffusion.cpp | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/model.h b/model.h index 8a5e0e27a..8e00a6108 100644 --- a/model.h +++ b/model.h @@ -82,6 +82,10 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } +static bool sd_version_use_concat(SDVersion version) { + return version == VERSION_INSTRUCT_PIX2PIX || sd_version_is_inpaint(version); +} + enum PMVersion { PM_VERSION_1, PM_VERSION_2, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1337df079..601326aa6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -844,6 +844,10 @@ class StableDiffusionGGML { // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) 
float img_cfg_scale = guidance; + if (img_cfg_scale != cfg_scale && !sd_version_use_concat(version)) { + LOG_WARN("2-conditioning CFG is not supported with this model, disabling it..."); + img_cfg_scale = cfg_scale; + } LOG_DEBUG("Sample"); struct ggml_init_params params; @@ -866,9 +870,8 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); - bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale; - has_unconditioned = has_unconditioned || has_img_guidance; + bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_img_guidance = cfg_scale != img_cfg_scale && uncond.c_crossattn != NULL; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper @@ -1030,9 +1033,13 @@ class StableDiffusionGGML { if (has_img_guidance) { latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); } else { + // img_cfg_scale == cfg_scale latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } } + } else if(has_img_guidance){ + // img_cfg_scale == 1 + latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); } if (is_skiplayer_step) { latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; From 1e9bfc3bb1aabb671a54e9d854abe1a9fa330cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:27 +0200 Subject: [PATCH 05/16] Fix pix2pix latent inputs + improve inpainting a bit + fix naming --- stable-diffusion.cpp | 146 ++++++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 57 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 601326aa6..9b1d54a81 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1113,6 
+1113,30 @@ class StableDiffusionGGML { return latent; } + ggml_tensor* + get_first_stage_encoding_mode(ggml_context* work_ctx, ggml_tensor* moments) { + // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample + ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor_set_f32_randn(noise, rng); + // noise = load_tensor_from_file(work_ctx, "noise.bin"); + { + float mean = 0; + for (int i = 0; i < latent->ne[3]; i++) { + for (int j = 0; j < latent->ne[2]; j++) { + for (int k = 0; k < latent->ne[1]; k++) { + for (int l = 0; l < latent->ne[0]; l++) { + // mode and mean are the same for gaussians + mean = ggml_tensor_get_f32(moments, l, k, j, i); + ggml_tensor_set_f32(latent, mean, l, k, j, i); + } + } + } + } + } + return latent; + } + ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -1298,7 +1322,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + ggml_tensor* masked_latent = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1487,42 +1511,43 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); ggml_tensor* noise_mask = nullptr; if (sd_version_is_inpaint(sd_ctx->sd->version)) { - if (masked_image == NULL) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } - // no mask, set the whole image as masked - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - for (int64_t x = 0; x < masked_image->ne[0]; x++) { - for (int64_t y = 0; y < masked_image->ne[1]; y++) { - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - // TODO: this might be wrong - for (int64_t c = 0; c < init_latent->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); - } - for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 1, x, y, c); - } - } else { - ggml_tensor_set_f32(masked_image, 1, x, y, 0); - for (int64_t c = 1; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); - } + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } + auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); + // no mask, set the whole image as masked + for (int64_t x = 0; x < empty_latent->ne[0]; x++) { + for (int64_t y = 0; y < empty_latent->ne[1]; y++) { + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + // TODO: this might be wrong + for (int64_t c = 0; c < init_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 0, x, y, c); + } + for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 1, x, y, c); + } + 
} else { + ggml_tensor_set_f32(empty_latent, 1, x, y, 0); + for (int64_t c = 1; c < empty_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 0, x, y, c); } } } } - cond.c_concat = masked_image; - uncond.c_concat = masked_image; - // noise_mask = masked_image; + if (masked_latent == NULL) { + masked_latent = empty_latent; + } + cond.c_concat = masked_latent; + uncond.c_concat = empty_latent; + // noise_mask = masked_latent; } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { - cond.c_concat = masked_image; - auto empty_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); - ggml_set_f32(empty_img, 0); - uncond.c_concat = empty_img; + cond.c_concat = masked_latent; + auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]); + ggml_set_f32(empty_latent, 0); + uncond.c_concat = empty_latent; } else { - noise_mask = masked_image; + noise_mask = masked_latent; } for (int b = 0; b < batch_count; b++) { @@ -1802,39 +1827,42 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_to_tensor(init_image.data, init_img); - ggml_tensor* init_latent = NULL; + ggml_tensor* masked_latent; + + ggml_tensor* init_latent = NULL; + ggml_tensor* init_moments = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + init_moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } - ggml_tensor* masked_image; - if (sd_version_is_inpaint(sd_ctx->sd->version)) { int64_t mask_channels = 1; if (sd_ctx->sd->version == VERSION_FLUX_FILL) { mask_channels = 8 * 8; // flatten the whole mask } ggml_tensor* masked_img = 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects? + sd_image_to_tensor(init_image.data, init_img); sd_apply_mask(init_img, mask_img, masked_img); - ggml_tensor* masked_image_0 = NULL; + ggml_tensor* masked_latent_0 = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { - masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); } - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1); - for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { - for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { + masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1); + for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) { + for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) { int mx = ix * 8; int my = iy * 8; if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k); + for (int k = 0; k < masked_latent_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k); + ggml_tensor_set_f32(masked_latent, v, ix, iy, k); } // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image for (int x = 0; x < 8; x++) { @@ -1842,31 +1870,35 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); // TODO: check if the way the mask is flattened is correct (is 
it supposed to be x*8+y or x+8*y?) // python code was using "b (h 8) (w 8) -> b (8 8) h w" - ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y); + ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y); } } } else { float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy, 0); - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels); + ggml_tensor_set_f32(masked_latent, m, ix, iy, 0); + for (int k = 0; k < masked_latent_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k); + ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels); } } } } } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { - // Not actually masked, we're just highjacking the masked_image variable since it will be used the same way - masked_image = init_latent; + // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way + if (!sd_ctx->sd->use_tiny_autoencoder) { + masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); + } else { + masked_latent = init_latent; + } } else { // LOG_WARN("Inpainting with a base model is not great"); - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); - for (int ix = 0; ix < masked_image->ne[0]; ix++) { - for (int iy = 0; iy < masked_image->ne[1]; iy++) { + masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); + for (int ix = 0; ix < masked_latent->ne[0]; ix++) { + for (int iy = 0; iy < masked_latent->ne[1]; iy++) { int mx = ix * 8; int my = iy * 8; float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy); + ggml_tensor_set_f32(masked_latent, m, ix, iy); } } } @@ -1908,7 +1940,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, 
slg_scale, skip_layer_start, skip_layer_end, - masked_image); + masked_latent); size_t t2 = ggml_time_ms(); From 58d5cd96e086c00d58e981a5b091c8ce6f4e7ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:29 +0200 Subject: [PATCH 06/16] prepare for other pix2pix-like models --- model.cpp | 2 +- model.h | 10 +++++++--- stable-diffusion.cpp | 6 +++--- unet.hpp | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/model.cpp b/model.cpp index 5106e69cb..5fb99eef6 100644 --- a/model.cpp +++ b/model.cpp @@ -1695,7 +1695,7 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_SD1_INPAINT; } if(is_ip2p) { - return VERSION_INSTRUCT_PIX2PIX; + return VERSION_SD1_PIX2PIX; } return VERSION_SD1; } else if (token_embedding_weight.ne[0] == 1024) { diff --git a/model.h b/model.h index 8e00a6108..7c83d6ae9 100644 --- a/model.h +++ b/model.h @@ -21,7 +21,7 @@ enum SDVersion { VERSION_SD1, VERSION_SD1_INPAINT, - VERSION_INSTRUCT_PIX2PIX, + VERSION_SD1_PIX2PIX, VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SDXL, @@ -48,7 +48,7 @@ static inline bool sd_version_is_sd3(SDVersion version) { } static inline bool sd_version_is_sd1(SDVersion version) { - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_INSTRUCT_PIX2PIX) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) { return true; } return false; @@ -82,8 +82,12 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } +static inline bool sd_version_is_edit(SDVersion version) { + return version == VERSION_SD1_PIX2PIX; +} + static bool sd_version_use_concat(SDVersion version) { - return version == VERSION_INSTRUCT_PIX2PIX || sd_version_is_inpaint(version); + return sd_version_is_edit(version) || sd_version_is_inpaint(version); } enum PMVersion { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9b1d54a81..de5cc8995 100644 --- a/stable-diffusion.cpp +++ 
b/stable-diffusion.cpp @@ -1470,7 +1470,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != guidance) { + if (cfg_scale != 1.0 || sd_version_use_concat(sd_ctx->sd->version) && cfg_scale != guidance) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; @@ -1541,7 +1541,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, cond.c_concat = masked_latent; uncond.c_concat = empty_latent; // noise_mask = masked_latent; - } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { + } else if (sd_version_is_edit(sd_ctx->sd->version)) { cond.c_concat = masked_latent; auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]); ggml_set_f32(empty_latent, 0); @@ -1883,7 +1883,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } } - } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) { + } else if (sd_version_is_edit(sd_ctx->sd->version)) { // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way if (!sd_ctx->sd->use_tiny_autoencoder) { masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); diff --git a/unet.hpp b/unet.hpp index fdbd82453..c31d1e833 100644 --- a/unet.hpp +++ b/unet.hpp @@ -207,7 +207,7 @@ class UnetModelBlock : public GGMLBlock { } if (sd_version_is_inpaint(version)) { in_channels = 9; - } else if (version == VERSION_INSTRUCT_PIX2PIX) { + } else if (version == VERSION_SD1_PIX2PIX) { in_channels = 8; } From bf9def4eaa4507f228f5a422db8435b25e9b447e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:32 +0200 Subject: [PATCH 07/16] Support sdxl ip2p 
--- model.cpp | 7 +++++-- model.h | 5 +++-- stable-diffusion.cpp | 1 + unet.hpp | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/model.cpp b/model.cpp index 5fb99eef6..afc5b627e 100644 --- a/model.cpp +++ b/model.cpp @@ -1674,11 +1674,14 @@ SDVersion ModelLoader::get_sd_version() { } } bool is_inpaint = input_block_weight.ne[2] == 9; - bool is_ip2p = input_block_weight.ne[2] == 8; + bool is_ip2p = input_block_weight.ne[2] == 8; if (is_xl) { if (is_inpaint) { return VERSION_SDXL_INPAINT; } + if (is_ip2p) { + return VERSION_SDXL_PIX2PIX; + } return VERSION_SDXL; } @@ -1694,7 +1697,7 @@ SDVersion ModelLoader::get_sd_version() { if (is_inpaint) { return VERSION_SD1_INPAINT; } - if(is_ip2p) { + if (is_ip2p) { return VERSION_SD1_PIX2PIX; } return VERSION_SD1; diff --git a/model.h b/model.h index 7c83d6ae9..66037e4e3 100644 --- a/model.h +++ b/model.h @@ -26,6 +26,7 @@ enum SDVersion { VERSION_SD2_INPAINT, VERSION_SDXL, VERSION_SDXL_INPAINT, + VERSION_SDXL_PIX2PIX, VERSION_SVD, VERSION_SD3, VERSION_FLUX, @@ -62,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) { } static inline bool sd_version_is_sdxl(SDVersion version) { - if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) { + if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) { return true; } return false; @@ -83,7 +84,7 @@ static inline bool sd_version_is_dit(SDVersion version) { } static inline bool sd_version_is_edit(SDVersion version) { - return version == VERSION_SD1_PIX2PIX; + return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX; } static bool sd_version_use_concat(SDVersion version) { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index de5cc8995..b981381a3 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -32,6 +32,7 @@ const char* model_version_to_str[] = { "SD 2.x Inpaint", "SDXL", "SDXL Inpaint", + "SDXL Instruct-Pix2Pix", "SVD", "SD3.x", "Flux", diff --git a/unet.hpp 
b/unet.hpp index c31d1e833..b3fae53a4 100644 --- a/unet.hpp +++ b/unet.hpp @@ -207,7 +207,7 @@ class UnetModelBlock : public GGMLBlock { } if (sd_version_is_inpaint(version)) { in_channels = 9; - } else if (version == VERSION_SD1_PIX2PIX) { + } else if (sd_version_is_edit(version)) { in_channels = 8; } From 48ef621010fad040739260887691a56def435f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:35 +0200 Subject: [PATCH 08/16] fix reference image embeddings --- stable-diffusion.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b981381a3..46e5a7db8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1887,7 +1887,12 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } else if (sd_version_is_edit(sd_ctx->sd->version)) { // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way if (!sd_ctx->sd->use_tiny_autoencoder) { - masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); + if (sd_ctx->sd->is_using_edm_v_parameterization) { + // for CosXL edit + masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); + } else { + masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); + } } else { masked_latent = init_latent; } From db4de5367aa23478e25e3357da3a7def1ec5d23f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:37 +0200 Subject: [PATCH 09/16] Support 2-cond cfg properly in cli --- examples/cli/main.cpp | 78 +++++++++++++++++-------------- stable-diffusion.cpp | 106 ++++++++++++------------------------------ stable-diffusion.h | 48 +++++++++---------- 3 files changed, 94 insertions(+), 138 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index c7db3708b..0490e59ba 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -96,15 +96,16 @@ struct 
SDParams { std::string prompt; std::string negative_prompt; - float min_cfg = 1.0f; - float cfg_scale = 7.0f; - float guidance = 3.5f; - float eta = 0.f; - float style_ratio = 20.f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; - int batch_count = 1; + float min_cfg = 1.0f; + float cfg_scale = 7.0f; + float img_cfg_scale = INFINITY; + float guidance = 3.5f; + float eta = 0.f; + float style_ratio = 20.f; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; + int batch_count = 1; int video_frames = 6; int motion_bucket_id = 127; @@ -175,6 +176,7 @@ void print_params(SDParams params) { printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); printf(" min_cfg: %.2f\n", params.min_cfg); printf(" cfg_scale: %.2f\n", params.cfg_scale); + printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale); printf(" slg_scale: %.2f\n", params.slg_scale); printf(" guidance: %.2f\n", params.guidance); printf(" eta: %.2f\n", params.eta); @@ -232,7 +234,8 @@ void print_usage(int argc, const char* argv[]) { printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); - printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n"); + printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); + printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); @@ -462,6 +465,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.cfg_scale = std::stof(argv[i]); + }
else if (arg == "--img-cfg-scale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.img_cfg_scale = std::stof(argv[i]); } else if (arg == "--guidance") { if (++i >= argc) { invalid_arg = true; @@ -743,6 +752,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { params.output_path = "output.gguf"; } } + + if (!isfinite(params.img_cfg_scale)) { + params.img_cfg_scale = params.cfg_scale; + } } static std::string sd_basename(const std::string& path) { @@ -837,6 +850,18 @@ int main(int argc, const char* argv[]) { parse_args(argc, argv, params); + sd_guidance_params_t guidance_params = {params.cfg_scale, + params.img_cfg_scale, + params.min_cfg, + params.guidance, + { + params.skip_layers.data(), + params.skip_layers.size(), + params.skip_layer_start, + params.skip_layer_end, + params.slg_scale, + }}; + sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { @@ -1029,8 +1054,7 @@ int main(int argc, const char* argv[]) { params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, - params.cfg_scale, - params.guidance, + guidance_params, params.eta, params.width, params.height, @@ -1042,12 +1066,7 @@ int main(int argc, const char* argv[]) { params.control_strength, params.style_ratio, params.normalize_input, - params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + params.input_id_images_path.c_str()); } else if (params.mode == IMG2IMG || params.mode == IMG2VID) { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1063,8 +1082,7 @@ int main(int argc, const char* argv[]) { params.motion_bucket_id, params.fps, params.augmentation_level, - params.min_cfg, - params.cfg_scale, + guidance_params, params.sample_method, params.sample_steps, params.strength, @@ -1097,8 +1115,7 @@ int main(int argc, const char* argv[]) { params.prompt.c_str(), params.negative_prompt.c_str(), 
params.clip_skip, - params.cfg_scale, - params.guidance, + guidance_params, params.eta, params.width, params.height, @@ -1111,12 +1128,7 @@ int main(int argc, const char* argv[]) { params.control_strength, params.style_ratio, params.normalize_input, - params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + params.input_id_images_path.c_str()); } } else { // EDIT results = edit(sd_ctx, @@ -1125,25 +1137,19 @@ int main(int argc, const char* argv[]) { params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, - params.cfg_scale, - params.guidance, + guidance_params, params.eta, params.width, params.height, params.sample_method, params.sample_steps, - params.strength, params.seed, params.batch_count, control_image, params.control_strength, params.style_ratio, params.normalize_input, - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + params.input_id_images_path.c_str()); } if (results == NULL) { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 46e5a7db8..4d7bdf522 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -828,25 +828,26 @@ class StableDiffusionGGML { SDCondition uncond, ggml_tensor* control_hint, float control_strength, - float min_cfg, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, sample_method_t method, const std::vector& sigmas, int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, ggml_tensor* noise_mask = nullptr) { + std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); + // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) - float img_cfg_scale = guidance; + float 
cfg_scale = guidance.txt_cfg; + float img_cfg_scale = guidance.img_cfg; + float slg_scale = guidance.slg.scale; + + float min_cfg = guidance.min_cfg; + if (img_cfg_scale != cfg_scale && !sd_version_use_concat(version)) { - LOG_WARN("2-conditioning CFG is not supported with this model, disabling it..."); + LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); img_cfg_scale = cfg_scale; } @@ -912,7 +913,7 @@ class StableDiffusionGGML { float t = denoiser->sigma_to_t(sigma); std::vector timesteps_vec(x->ne[3], t); // [N, ] auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(x->ne[3], guidance); + std::vector guidance_vec(x->ne[3], guidance.distilled_guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); @@ -997,7 +998,7 @@ class StableDiffusionGGML { } int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); + bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); float* skip_layer_data = NULL; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); @@ -1038,7 +1039,7 @@ class StableDiffusionGGML { latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } } - } else if(has_img_guidance){ + } else if (has_img_guidance) { // img_cfg_scale == 1 latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); } @@ -1304,8 +1305,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, std::string prompt, std::string negative_prompt, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, @@ -1319,11 +1319,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, bool normalize_input, std::string 
input_id_images_path, std::vector ref_latents, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_latent = NULL) { + ggml_tensor* masked_latent = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1471,7 +1467,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (cfg_scale != 1.0 || sd_version_use_concat(sd_ctx->sd->version) && cfg_scale != guidance) { + if (guidance.txt_cfg != 1.0 || sd_version_use_concat(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; @@ -1569,6 +1565,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } + // Disable min_cfg + guidance.min_cfg = guidance.txt_cfg; + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, @@ -1576,8 +1575,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, uncond, image_hint, control_strength, - cfg_scale, - cfg_scale, guidance, eta, sample_method, @@ -1585,10 +1582,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, - skip_layers, - slg_scale, - skip_layer_start, - skip_layer_end, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1667,8 +1660,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt_c_str, const char* negative_prompt_c_str, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, @@ -1680,13 +1672,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float control_strength, float 
style_ratio, bool normalize_input, - const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + const char* input_id_images_path_c_str) { LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1731,7 +1717,6 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, prompt_c_str, negative_prompt_c_str, clip_skip, - cfg_scale, guidance, eta, width, @@ -1745,11 +1730,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, style_ratio, normalize_input, input_id_images_path_c_str, - {}, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end); + {}); size_t t1 = ggml_time_ms(); @@ -1764,8 +1745,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, const char* prompt_c_str, const char* negative_prompt_c_str, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, @@ -1778,13 +1758,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float control_strength, float style_ratio, bool normalize_input, - const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + const char* input_id_images_path_c_str) { LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1834,7 +1808,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* init_moments = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { init_moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1927,7 
+1901,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, prompt_c_str, negative_prompt_c_str, clip_skip, - cfg_scale, guidance, eta, width, @@ -1942,10 +1915,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, normalize_input, input_id_images_path_c_str, {}, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, masked_latent); size_t t2 = ggml_time_ms(); @@ -1963,8 +1932,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int motion_bucket_id, int fps, float augmentation_level, - float min_cfg, - float cfg_scale, + sd_guidance_params_t guidance, enum sample_method_t sample_method, int sample_steps, float strength, @@ -2041,9 +2009,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, uncond, {}, 0.f, - min_cfg, - cfg_scale, - 0.f, + guidance, 0.f, sample_method, sigmas, @@ -2094,26 +2060,19 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, const char* prompt_c_str, const char* negative_prompt_c_str, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, - sample_method_t sample_method, + enum sample_method_t sample_method, int sample_steps, - float strength, int64_t seed, int batch_count, const sd_image_t* control_cond, float control_strength, float style_ratio, bool normalize_input, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + const char* input_id_images_path_c_str) { LOG_DEBUG("edit %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -2173,7 +2132,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, prompt_c_str, negative_prompt_c_str, clip_skip, - cfg_scale, guidance, eta, width, @@ -2188,10 +2146,6 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, normalize_input, "", ref_latents, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, NULL); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 
b4d6fc327..961899179 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -129,6 +129,21 @@ typedef struct { typedef struct sd_ctx_t sd_ctx_t; +typedef struct sd_slg_params_t { + int* layers; + size_t layer_count; + float layer_start; + float layer_end; + float scale; +} sd_slg_params_t; +typedef struct sd_guidance_params_t { + float txt_cfg; + float img_cfg; + float min_cfg; + float distilled_guidance; + sd_slg_params_t slg; +} sd_guidance_params_t; + SD_API sd_ctx_t* new_sd_ctx(const char* model_path, const char* clip_l_path, const char* clip_g_path, @@ -161,8 +176,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt, const char* negative_prompt, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, @@ -174,12 +188,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, float control_strength, float style_strength, bool normalize_input, - const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + const char* input_id_images_path); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -187,8 +196,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, const char* prompt, const char* negative_prompt, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, @@ -201,12 +209,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, float control_strength, float style_strength, bool normalize_input, - const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + const char* input_id_images_path); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -216,8 +219,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int motion_bucket_id, int fps, float augmentation_level, - float min_cfg, - float cfg_scale, + sd_guidance_params_t 
guidance, enum sample_method_t sample_method, int sample_steps, float strength, @@ -229,25 +231,19 @@ SD_API sd_image_t* edit(sd_ctx_t* sd_ctx, const char* prompt, const char* negative_prompt, int clip_skip, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, int width, int height, enum sample_method_t sample_method, int sample_steps, - float strength, int64_t seed, int batch_count, const sd_image_t* control_cond, float control_strength, float style_strength, bool normalize_input, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + const char* input_id_images_path); typedef struct upscaler_ctx_t upscaler_ctx_t; From 63e637748d7d458e9239ec69e42314e114639eaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:32:39 +0200 Subject: [PATCH 10/16] fix typo in help --- examples/cli/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0490e59ba..1193bf202 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -234,7 +234,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); - printf(" --img_cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); + printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); From d627ee9b4221d257ceff03358c51f2c708d0b0d8 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 7 Jul 2025 12:33:02 +0200 Subject: [PATCH 11/16] Support masks for ip2p models --- stable-diffusion.cpp | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4d7bdf522..1d4e36156 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -835,7 +835,7 @@ class StableDiffusionGGML { int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* denoise_mask = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) @@ -1055,10 +1055,10 @@ class StableDiffusionGGML { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } - if (noise_mask != nullptr) { + if (denoise_mask != nullptr) { for (int64_t x = 0; x < denoised->ne[0]; x++) { for (int64_t y = 0; y < denoised->ne[1]; y++) { - float mask = ggml_tensor_get_f32(noise_mask, x, y); + float mask = ggml_tensor_get_f32(denoise_mask, x, y); for (int64_t k = 0; k < denoised->ne[2]; k++) { float init = ggml_tensor_get_f32(init_latent, x, y, k); float den = ggml_tensor_get_f32(denoised, x, y, k); @@ -1319,7 +1319,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, bool normalize_input, std::string input_id_images_path, std::vector ref_latents, - ggml_tensor* masked_latent = NULL) { + ggml_tensor* concat_latent = NULL, + ggml_tensor* denoise_mask = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1506,7 +1507,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - ggml_tensor* noise_mask = nullptr; if (sd_version_is_inpaint(sd_ctx->sd->version)) { int64_t mask_channels = 1; if (sd_ctx->sd->version == VERSION_FLUX_FILL) { @@ -1532,21 +1532,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } } } - if (masked_latent == NULL) { - masked_latent = empty_latent; + if (concat_latent == NULL) { + concat_latent = empty_latent; } - cond.c_concat = masked_latent; + cond.c_concat = concat_latent; uncond.c_concat = empty_latent; - // noise_mask = masked_latent; + denoise_mask = NULL; } else if (sd_version_is_edit(sd_ctx->sd->version)) { - cond.c_concat = masked_latent; - auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]); + auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; - } else { - noise_mask = masked_latent; - } + if (concat_latent == NULL) { + concat_latent = empty_latent; + } + cond.c_concat = concat_latent; + } for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); int64_t cur_seed = seed + b; @@ -1582,7 +1583,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, - noise_mask); + denoise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1802,7 +1803,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_to_tensor(init_image.data, init_img); - ggml_tensor* masked_latent; + ggml_tensor* concat_latent; + ggml_tensor* denoise_mask = NULL; ggml_tensor* init_latent = NULL; ggml_tensor* 
init_moments = NULL; @@ -1822,22 +1824,22 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects? sd_image_to_tensor(init_image.data, init_img); sd_apply_mask(init_img, mask_img, masked_img); - ggml_tensor* masked_latent_0 = NULL; + ggml_tensor* masked_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { - masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); } - masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1); - for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) { - for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) { + concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1); + for (int ix = 0; ix < masked_latent->ne[0]; ix++) { + for (int iy = 0; iy < masked_latent->ne[1]; iy++) { int mx = ix * 8; int my = iy * 8; if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_latent_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k); - ggml_tensor_set_f32(masked_latent, v, ix, iy, k); + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k); } // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image for (int x = 0; x < 8; x++) { @@ -1845,40 +1847,42 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); // TODO: check if the way the mask is flattened is 
correct (is it supposed to be x*8+y or x+8*y?) // python code was using "b (h 8) (w 8) -> b (8 8) h w" - ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y); + ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y); } } } else { float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_latent, m, ix, iy, 0); - for (int k = 0; k < masked_latent_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k); - ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels); + ggml_tensor_set_f32(concat_latent, m, ix, iy, 0); + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); } } } } } else if (sd_version_is_edit(sd_ctx->sd->version)) { - // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way + // Not actually masked, we're just highjacking the concat_latent variable since it will be used the same way if (!sd_ctx->sd->use_tiny_autoencoder) { if (sd_ctx->sd->is_using_edm_v_parameterization) { // for CosXL edit - masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); + concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); } else { - masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); + concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); } } else { - masked_latent = init_latent; + concat_latent = init_latent; } - } else { + } + + { // LOG_WARN("Inpainting with a base model is not great"); - masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); - for (int ix = 0; ix < masked_latent->ne[0]; ix++) { - for (int iy = 0; iy < masked_latent->ne[1]; iy++) { + denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); + for (int ix = 0; ix < 
denoise_mask->ne[0]; ix++) { + for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { int mx = ix * 8; int my = iy * 8; float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_latent, m, ix, iy); + ggml_tensor_set_f32(denoise_mask, m, ix, iy); } } } @@ -1915,7 +1919,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, normalize_input, input_id_images_path_c_str, {}, - masked_latent); + concat_latent, + denoise_mask); size_t t2 = ggml_time_ms(); From 066744d192c022cc36fa918a825cb4aa20819d46 Mon Sep 17 00:00:00 2001 From: leejet Date: Fri, 11 Jul 2025 22:22:26 +0800 Subject: [PATCH 12/16] unify code style --- stable-diffusion.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.h b/stable-diffusion.h index 961899179..2aaf476c6 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -129,14 +129,15 @@ typedef struct { typedef struct sd_ctx_t sd_ctx_t; -typedef struct sd_slg_params_t { +typedef struct { int* layers; size_t layer_count; float layer_start; float layer_end; float scale; } sd_slg_params_t; -typedef struct sd_guidance_params_t { + +typedef struct { float txt_cfg; float img_cfg; float min_cfg; From 1802b997a29604f9a2b4cdab407502d3e7bebff8 Mon Sep 17 00:00:00 2001 From: leejet Date: Fri, 11 Jul 2025 22:46:39 +0800 Subject: [PATCH 13/16] delete unused code --- stable-diffusion.cpp | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1d4e36156..44d6e32d2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1115,30 +1115,6 @@ class StableDiffusionGGML { return latent; } - ggml_tensor* - get_first_stage_encoding_mode(ggml_context* work_ctx, ggml_tensor* moments) { - // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise 
= ggml_dup_tensor(work_ctx, latent); - ggml_tensor_set_f32_randn(noise, rng); - // noise = load_tensor_from_file(work_ctx, "noise.bin"); - { - float mean = 0; - for (int i = 0; i < latent->ne[3]; i++) { - for (int j = 0; j < latent->ne[2]; j++) { - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - // mode and mean are the same for gaussians - mean = ggml_tensor_get_f32(moments, l, k, j, i); - ggml_tensor_set_f32(latent, mean, l, k, j, i); - } - } - } - } - } - return latent; - } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -1867,7 +1843,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, // for CosXL edit concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); } else { - concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments); + concat_latent = ggml_view_3d(work_ctx, + init_moments, + init_moments->ne[0], + init_moments->ne[1], + init_moments->ne[2] / 2, + init_moments->nb[1], + init_moments->nb[2], + 0); } } else { concat_latent = init_latent; @@ -1887,7 +1870,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } - print_ggml_tensor(init_latent, true); size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); From 540c49ac7705bf942a87b212c5b643b6145cea41 Mon Sep 17 00:00:00 2001 From: leejet Date: Fri, 11 Jul 2025 23:27:00 +0800 Subject: [PATCH 14/16] use edit mode --- model.h | 4 +-- stable-diffusion.cpp | 63 ++++++++++++++++++-------------------------- unet.hpp | 2 +- 3 files changed, 28 insertions(+), 41 deletions(-) diff --git a/model.h b/model.h index 66037e4e3..92de8edfd 100644 --- a/model.h +++ b/model.h @@ -83,12 +83,12 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } -static inline bool sd_version_is_edit(SDVersion version) { +static inline bool sd_version_is_unet_edit(SDVersion version) { return version == 
VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX; } static bool sd_version_use_concat(SDVersion version) { - return sd_version_is_edit(version) || sd_version_is_inpaint(version); + return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version); } enum PMVersion { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 44d6e32d2..6ffe26127 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1514,15 +1514,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, cond.c_concat = concat_latent; uncond.c_concat = empty_latent; denoise_mask = NULL; - } else if (sd_version_is_edit(sd_ctx->sd->version)) { - auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]); + } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { + auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; if (concat_latent == NULL) { concat_latent = empty_latent; } - cond.c_concat = concat_latent; - + cond.c_concat = ref_latents[0]; } for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); @@ -1782,23 +1781,12 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* concat_latent; ggml_tensor* denoise_mask = NULL; - ggml_tensor* init_latent = NULL; - ggml_tensor* init_moments = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - init_moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); - } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } - if (sd_version_is_inpaint(sd_ctx->sd->version)) { int64_t mask_channels = 1; if (sd_ctx->sd->version == VERSION_FLUX_FILL) { mask_channels = 8 * 8; // flatten the whole mask } ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - // Restore init_img (encode_first_stage has side effects) TODO: remove the 
side effects? - sd_image_to_tensor(init_image.data, init_img); sd_apply_mask(init_img, mask_img, masked_img); ggml_tensor* masked_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { @@ -1836,26 +1824,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } } - } else if (sd_version_is_edit(sd_ctx->sd->version)) { - // Not actually masked, we're just highjacking the concat_latent variable since it will be used the same way - if (!sd_ctx->sd->use_tiny_autoencoder) { - if (sd_ctx->sd->is_using_edm_v_parameterization) { - // for CosXL edit - concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments); - } else { - concat_latent = ggml_view_3d(work_ctx, - init_moments, - init_moments->ne[0], - init_moments->ne[1], - init_moments->ne[2] / 2, - init_moments->nb[1], - init_moments->nb[2], - 0); - } - } else { - concat_latent = init_latent; - } - } + } { // LOG_WARN("Inpainting with a base model is not great"); @@ -1870,6 +1839,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } + ggml_tensor* init_latent = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + } + size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); @@ -2097,11 +2074,21 @@ sd_image_t* edit(sd_ctx_t* sd_ctx, sd_image_to_tensor(ref_images[i].data, img); ggml_tensor* latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { + if (sd_ctx->sd->use_tiny_autoencoder) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + latent = ggml_view_3d(work_ctx, + latent, + latent->ne[0], + latent->ne[1], + latent->ne[2] / 2, + latent->nb[1], + latent->nb[2], + 0); + } else { ggml_tensor* moments = 
sd_ctx->sd->encode_first_stage(work_ctx, img); latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); } ref_latents.push_back(latent); } diff --git a/unet.hpp b/unet.hpp index b3fae53a4..9193dcd67 100644 --- a/unet.hpp +++ b/unet.hpp @@ -207,7 +207,7 @@ class UnetModelBlock : public GGMLBlock { } if (sd_version_is_inpaint(version)) { in_channels = 9; - } else if (sd_version_is_edit(version)) { + } else if (sd_version_is_unet_edit(version)) { in_channels = 8; } From df1a6ffcaf00d1322511e6e0c2e315218f795eed Mon Sep 17 00:00:00 2001 From: leejet Date: Fri, 11 Jul 2025 23:59:09 +0800 Subject: [PATCH 15/16] add img_cond --- model.h | 2 +- stable-diffusion.cpp | 32 ++++++++++++++++++++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/model.h b/model.h index 92de8edfd..1cb3ba2f6 100644 --- a/model.h +++ b/model.h @@ -87,7 +87,7 @@ static inline bool sd_version_is_unet_edit(SDVersion version) { return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX; } -static bool sd_version_use_concat(SDVersion version) { +static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) { return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6ffe26127..3083db7cc 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -826,6 +826,7 @@ class StableDiffusionGGML { ggml_tensor* noise, SDCondition cond, SDCondition uncond, + SDCondition img_cond, ggml_tensor* control_hint, float control_strength, sd_guidance_params_t guidance, @@ -838,15 +839,13 @@ class StableDiffusionGGML { ggml_tensor* denoise_mask = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); - // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance) - float cfg_scale = guidance.txt_cfg; float img_cfg_scale = guidance.img_cfg; 
float slg_scale = guidance.slg.scale; float min_cfg = guidance.min_cfg; - if (img_cfg_scale != cfg_scale && !sd_version_use_concat(version)) { + if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) { LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); img_cfg_scale = cfg_scale; } @@ -873,7 +872,7 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_img_guidance = cfg_scale != img_cfg_scale && uncond.c_crossattn != NULL; + bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper @@ -893,7 +892,7 @@ class StableDiffusionGGML { LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); } } - if (has_img_guidance) { + if (has_img_cond) { out_img_cond = ggml_dup_tensor(work_ctx, x); } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); @@ -981,13 +980,13 @@ class StableDiffusionGGML { } float* img_cond_data = NULL; - if (has_img_guidance) { + if (has_img_cond) { diffusion_model->compute(n_threads, noised_input, timesteps, - uncond.c_crossattn, - cond.c_concat, - uncond.c_vector, + img_cond.c_crossattn, + img_cond.c_concat, + img_cond.c_vector, guidance_tensor, ref_latents, -1, @@ -1032,14 +1031,15 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - if (has_img_guidance) { + if (has_img_cond) { + // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); } else { // img_cfg_scale == cfg_scale latent_result = 
negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } } - } else if (has_img_guidance) { + } else if (has_img_cond) { // img_cfg_scale == 1 latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); } @@ -1444,7 +1444,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (guidance.txt_cfg != 1.0 || sd_version_use_concat(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg) { + if (guidance.txt_cfg != 1.0 || \ + (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; @@ -1523,6 +1524,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } cond.c_concat = ref_latents[0]; } + SDCondition img_cond; + if (uncond.c_crossattn != NULL && \ + (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { + img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); + } for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); int64_t cur_seed = seed + b; @@ -1549,6 +1555,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, noise, cond, uncond, + img_cond, image_hint, control_strength, guidance, @@ -1972,6 +1979,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, cond, uncond, {}, + {}, 0.f, guidance, 0.f, From 7ada721fdee03dbbd7e6d2746057833ae5e0f39e Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 12 Jul 2025 15:28:27 +0800 Subject: [PATCH 16/16] format code --- model.cpp | 9 ++++----- stable-diffusion.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/model.cpp b/model.cpp index afc5b627e..d3a380ae6 100644 --- a/model.cpp +++ b/model.cpp @@ -100,7 +100,7 @@ const char* unused_tensors[] = { 
"model_ema.diffusion_model", "embedding_manager", "denoiser.sigmas", - "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training + "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training }; bool is_unused_tensor(std::string name) { @@ -1169,7 +1169,6 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const n_dims = 1; } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); @@ -1921,7 +1920,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend }; int tensor_count = 0; int64_t t1 = ggml_time_ms(); - bool partial = false; + bool partial = false; for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { ++tensor_count; @@ -2004,9 +2003,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } size_t tensor_max = processed_tensor_storages.size(); - int64_t t2 = ggml_time_ms(); + int64_t t2 = ggml_time_ms(); pretty_progress(++tensor_count, tensor_max, (t2 - t1) / 1000.0f); - t1 = t2; + t1 = t2; partial = tensor_count != tensor_max; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3083db7cc..c6b873fad 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1444,7 +1444,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (guidance.txt_cfg != 1.0 || \ + if (guidance.txt_cfg != 1.0 || (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { bool force_zero_embeddings = false; if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { @@ -1522,11 +1522,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, if (concat_latent == NULL) { concat_latent = empty_latent; } - 
cond.c_concat = ref_latents[0]; + cond.c_concat = ref_latents[0]; } SDCondition img_cond; - if (uncond.c_crossattn != NULL && \ - (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { + if (uncond.c_crossattn != NULL && + (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); } for (int b = 0; b < batch_count; b++) { @@ -1798,7 +1798,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* masked_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); } @@ -1832,7 +1832,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } } } - + { // LOG_WARN("Inpainting with a base model is not great"); denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);