Skip to content

Commit b9f5a5b

Browse files
committed
Support masks for ip2p models
1 parent b20ccb6 commit b9f5a5b

File tree

1 file changed

+45
-41
lines changed

1 file changed

+45
-41
lines changed

stable-diffusion.cpp

Lines changed: 45 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -810,8 +810,7 @@ class StableDiffusionGGML {
810810
const std::vector<float>& sigmas,
811811
int start_merge_step,
812812
SDCondition id_cond,
813-
ggml_tensor* noise_mask = nullptr) {
814-
813+
ggml_tensor* denoise_mask = NULL) {
815814
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
816815

817816
// TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1026,10 +1025,10 @@ class StableDiffusionGGML {
10261025
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
10271026
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
10281027
}
1029-
if (noise_mask != nullptr) {
1028+
if (denoise_mask != nullptr) {
10301029
for (int64_t x = 0; x < denoised->ne[0]; x++) {
10311030
for (int64_t y = 0; y < denoised->ne[1]; y++) {
1032-
float mask = ggml_tensor_get_f32(noise_mask, x, y);
1031+
float mask = ggml_tensor_get_f32(denoise_mask, x, y);
10331032
for (int64_t k = 0; k < denoised->ne[2]; k++) {
10341033
float init = ggml_tensor_get_f32(init_latent, x, y, k);
10351034
float den = ggml_tensor_get_f32(denoised, x, y, k);
@@ -1283,7 +1282,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
12831282
float style_ratio,
12841283
bool normalize_input,
12851284
std::string input_id_images_path,
1286-
ggml_tensor* masked_latent = NULL) {
1285+
ggml_tensor* concat_latent = NULL,
1286+
ggml_tensor* denoise_mask = NULL) {
12871287
if (seed < 0) {
12881288
// Generally, when using the provided command line, the seed is always >0.
12891289
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1470,7 +1470,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
14701470
int W = width / 8;
14711471
int H = height / 8;
14721472
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
1473-
ggml_tensor* noise_mask = nullptr;
14741473
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
14751474
int64_t mask_channels = 1;
14761475
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
@@ -1496,21 +1495,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
14961495
}
14971496
}
14981497
}
1499-
if (masked_latent == NULL) {
1500-
masked_latent = empty_latent;
1498+
if (concat_latent == NULL) {
1499+
concat_latent = empty_latent;
15011500
}
1502-
cond.c_concat = masked_latent;
1501+
cond.c_concat = concat_latent;
15031502
uncond.c_concat = empty_latent;
1504-
// noise_mask = masked_latent;
1503+
denoise_mask = NULL;
15051504
} else if (sd_version_is_edit(sd_ctx->sd->version)) {
1506-
cond.c_concat = masked_latent;
1507-
auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]);
1505+
auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
15081506
ggml_set_f32(empty_latent, 0);
15091507
uncond.c_concat = empty_latent;
1510-
} else {
1511-
noise_mask = masked_latent;
1512-
}
1508+
if (concat_latent == NULL) {
1509+
concat_latent = empty_latent;
1510+
}
1511+
cond.c_concat = concat_latent;
15131512

1513+
}
15141514
for (int b = 0; b < batch_count; b++) {
15151515
int64_t sampling_start = ggml_time_ms();
15161516
int64_t cur_seed = seed + b;
@@ -1545,7 +1545,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
15451545
sigmas,
15461546
start_merge_step,
15471547
id_cond,
1548-
noise_mask);
1548+
denoise_mask);
15491549

15501550
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
15511551
// print_ggml_tensor(x_0);
@@ -1756,7 +1756,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17561756

17571757
sd_image_to_tensor(init_image.data, init_img);
17581758

1759-
ggml_tensor* masked_latent;
1759+
ggml_tensor* concat_latent;
1760+
ggml_tensor* denoise_mask = NULL;
17601761

17611762
ggml_tensor* init_latent = NULL;
17621763
ggml_tensor* init_moments = NULL;
@@ -1776,63 +1777,65 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17761777
// Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
17771778
sd_image_to_tensor(init_image.data, init_img);
17781779
sd_apply_mask(init_img, mask_img, masked_img);
1779-
ggml_tensor* masked_latent_0 = NULL;
1780+
ggml_tensor* masked_latent = NULL;
17801781
if (!sd_ctx->sd->use_tiny_autoencoder) {
17811782
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
1782-
masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
1783+
masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
17831784
} else {
1784-
masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
1785+
masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
17851786
}
1786-
masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1);
1787-
for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) {
1788-
for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) {
1787+
concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
1788+
for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
1789+
for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
17891790
int mx = ix * 8;
17901791
int my = iy * 8;
17911792
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
1792-
for (int k = 0; k < masked_latent_0->ne[2]; k++) {
1793-
float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
1794-
ggml_tensor_set_f32(masked_latent, v, ix, iy, k);
1793+
for (int k = 0; k < masked_latent->ne[2]; k++) {
1794+
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
1795+
ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
17951796
}
17961797
// "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
17971798
for (int x = 0; x < 8; x++) {
17981799
for (int y = 0; y < 8; y++) {
17991800
float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
18001801
// TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
18011802
// python code was using "b (h 8) (w 8) -> b (8 8) h w"
1802-
ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y);
1803+
ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
18031804
}
18041805
}
18051806
} else {
18061807
float m = ggml_tensor_get_f32(mask_img, mx, my);
1807-
ggml_tensor_set_f32(masked_latent, m, ix, iy, 0);
1808-
for (int k = 0; k < masked_latent_0->ne[2]; k++) {
1809-
float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
1810-
ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels);
1808+
ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
1809+
for (int k = 0; k < masked_latent->ne[2]; k++) {
1810+
float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
1811+
ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
18111812
}
18121813
}
18131814
}
18141815
}
18151816
} else if (sd_version_is_edit(sd_ctx->sd->version)) {
1816-
// Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way
1817+
// Not actually masked, we're just hijacking the concat_latent variable since it will be used the same way
18171818
if (!sd_ctx->sd->use_tiny_autoencoder) {
18181819
if (sd_ctx->sd->is_using_edm_v_parameterization) {
18191820
// for CosXL edit
1820-
masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
1821+
concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
18211822
} else {
1822-
masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
1823+
concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
18231824
}
18241825
} else {
1825-
masked_latent = init_latent;
1826+
concat_latent = init_latent;
18261827
}
1827-
} else {
1828+
}
1829+
1830+
{
18281831
// LOG_WARN("Inpainting with a base model is not great");
1829-
masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
1830-
for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
1831-
for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
1832+
denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
1833+
for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
1834+
for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
18321835
int mx = ix * 8;
18331836
int my = iy * 8;
18341837
float m = ggml_tensor_get_f32(mask_img, mx, my);
1835-
ggml_tensor_set_f32(masked_latent, m, ix, iy);
1838+
ggml_tensor_set_f32(denoise_mask, m, ix, iy);
18361839
}
18371840
}
18381841
}
@@ -1868,7 +1871,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
18681871
style_ratio,
18691872
normalize_input,
18701873
input_id_images_path_c_str,
1871-
masked_latent);
1874+
concat_latent,
1875+
denoise_mask);
18721876

18731877
size_t t2 = ggml_time_ms();
18741878

0 commit comments

Comments
 (0)