@@ -835,7 +835,7 @@ class StableDiffusionGGML {
                         int start_merge_step,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
-                        ggml_tensor* noise_mask = nullptr) {
+                        ggml_tensor* denoise_mask = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);

         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1055,10 +1055,10 @@ class StableDiffusionGGML {
             pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
             // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
         }
-        if (noise_mask != nullptr) {
+        if (denoise_mask != nullptr) {
             for (int64_t x = 0; x < denoised->ne[0]; x++) {
                 for (int64_t y = 0; y < denoised->ne[1]; y++) {
-                    float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                    float mask = ggml_tensor_get_f32(denoise_mask, x, y);
                     for (int64_t k = 0; k < denoised->ne[2]; k++) {
                         float init = ggml_tensor_get_f32(init_latent, x, y, k);
                         float den = ggml_tensor_get_f32(denoised, x, y, k);
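For context, the loop above reads a per-pixel mask from `denoise_mask` alongside the initial and denoised latent values; the blend line itself falls outside this hunk, so the linear mix below is an assumption. A minimal standalone sketch of that kind of masked blend (plain `std::vector` buffers and the helper name `blend_with_mask` are illustrative, not part of the library):

```cpp
// Hypothetical illustration only: blend an initial latent with a denoised latent using a
// per-pixel mask, assuming out = init + mask * (denoised - init).
#include <cassert>
#include <cstdio>
#include <vector>

// Latents are stored as [W * H * C] with x fastest, the mask as [W * H].
std::vector<float> blend_with_mask(const std::vector<float>& init,
                                   const std::vector<float>& denoised,
                                   const std::vector<float>& mask,
                                   int W, int H, int C) {
    assert(init.size() == denoised.size() && init.size() == (size_t)W * H * C);
    assert(mask.size() == (size_t)W * H);
    std::vector<float> out(init.size());
    for (int k = 0; k < C; k++) {
        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) {
                size_t i = (size_t)k * W * H + (size_t)y * W + x;
                float m  = mask[(size_t)y * W + x];  // 1 = keep denoised, 0 = keep init
                out[i]   = init[i] + m * (denoised[i] - init[i]);
            }
        }
    }
    return out;
}

int main() {
    std::vector<float> init(4, 0.0f), den(4, 1.0f), mask = {0.f, 0.25f, 0.75f, 1.f};
    auto out = blend_with_mask(init, den, mask, 2, 2, 1);
    for (float v : out) printf("%.2f ", v);  // 0.00 0.25 0.75 1.00
    printf("\n");
}
```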
@@ -1319,7 +1319,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                             bool normalize_input,
                             std::string input_id_images_path,
                             std::vector<ggml_tensor*> ref_latents,
-                            ggml_tensor* masked_latent = NULL) {
+                            ggml_tensor* concat_latent = NULL,
+                            ggml_tensor* denoise_mask = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1506,7 +1507,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    ggml_tensor* noise_mask = nullptr;
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         int64_t mask_channels = 1;
         if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
@@ -1532,21 +1532,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                 }
             }
         }
-        if (masked_latent == NULL) {
-            masked_latent = empty_latent;
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
         }
-        cond.c_concat = masked_latent;
+        cond.c_concat = concat_latent;
         uncond.c_concat = empty_latent;
-        // noise_mask = masked_latent;
+        denoise_mask = NULL;
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        cond.c_concat = masked_latent;
-        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]);
+        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
         ggml_set_f32(empty_latent, 0);
         uncond.c_concat = empty_latent;
-    } else {
-        noise_mask = masked_latent;
-    }
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
+        }
+        cond.c_concat = concat_latent;

+    }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed = seed + b;
@@ -1582,7 +1583,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                              start_merge_step,
                                              id_cond,
                                              ref_latents,
-                                             noise_mask);
+                                             denoise_mask);

         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1802,7 +1803,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,

     sd_image_to_tensor(init_image.data, init_img);

-    ggml_tensor* masked_latent;
+    ggml_tensor* concat_latent;
+    ggml_tensor* denoise_mask = NULL;

     ggml_tensor* init_latent = NULL;
     ggml_tensor* init_moments = NULL;
@@ -1822,63 +1824,65 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
         sd_image_to_tensor(init_image.data, init_img);
         sd_apply_mask(init_img, mask_img, masked_img);
-        ggml_tensor* masked_latent_0 = NULL;
+        ggml_tensor* masked_latent = NULL;
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
         } else {
-            masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1);
-        for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) {
+        concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
+        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
+            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
                 int mx = ix * 8;
                 int my = iy * 8;
                 if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
                     }
                     // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
                     for (int x = 0; x < 8; x++) {
                         for (int y = 0; y < 8; y++) {
                             float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
                             // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
                             // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                            ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y);
+                            ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
                         }
                     }
                 } else {
                     float m = ggml_tensor_get_f32(mask_img, mx, my);
-                    ggml_tensor_set_f32(masked_latent, m, ix, iy, 0);
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels);
+                    ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                     }
                 }
             }
         }
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way
+        // Not actually masked, we're just highjacking the concat_latent variable since it will be used the same way
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             if (sd_ctx->sd->is_using_edm_v_parameterization) {
                 // for CosXL edit
-                masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
             } else {
-                masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
             }
         } else {
-            masked_latent = init_latent;
+            concat_latent = init_latent;
        }
-    } else {
+    }
+
+    {
         // LOG_WARN("Inpainting with a base model is not great");
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
-        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
+        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+        for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
+            for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
                 int mx = ix * 8;
                 int my = iy * 8;
                 float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(masked_latent, m, ix, iy);
+                ggml_tensor_set_f32(denoise_mask, m, ix, iy);
             }
         }
     }
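The VERSION_FLUX_FILL branch above packs each 8x8 block of the full-resolution mask into 64 extra channels of `concat_latent` at the corresponding latent pixel, using channel index `x * 8 + y` (the TODO notes this ordering is unverified against the Python rearrange "b (h 8) (w 8) -> b (8 8) h w"). A minimal standalone sketch of that index mapping, with plain vectors in place of ggml tensors and the helper name `flatten_mask_8x8` chosen for illustration:

```cpp
// Hypothetical illustration of the 8x8 mask flattening used above: a (W*8) x (H*8) mask
// becomes 64 channels per latent pixel, with channel = x * 8 + y, as in the diff.
#include <cstdio>
#include <vector>

// mask: full-resolution mask, size (W*8) * (H*8), row-major (x fastest)
// out:  size W * H * 64, indexed as [(c * H + iy) * W + ix]
std::vector<float> flatten_mask_8x8(const std::vector<float>& mask, int W, int H) {
    std::vector<float> out((size_t)W * H * 64);
    for (int ix = 0; ix < W; ix++) {
        for (int iy = 0; iy < H; iy++) {
            int mx = ix * 8;
            int my = iy * 8;
            for (int x = 0; x < 8; x++) {
                for (int y = 0; y < 8; y++) {
                    float m = mask[(size_t)(my + y) * (W * 8) + (mx + x)];
                    int   c = x * 8 + y;  // same ordering as the diff; see its TODO about x*8+y vs x+8*y
                    out[((size_t)c * H + iy) * W + ix] = m;
                }
            }
        }
    }
    return out;
}

int main() {
    // 8x8 mask covering a single latent pixel (W = H = 1); mark one full-resolution pixel.
    std::vector<float> mask(64, 0.0f);
    mask[3 * 8 + 5] = 1.0f;  // full-res pixel at x = 5, y = 3
    auto out = flatten_mask_8x8(mask, 1, 1);
    for (int c = 0; c < 64; c++)
        if (out[c] != 0.0f) printf("set channel: %d\n", c);  // prints 43 = 5*8 + 3
}
```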
@@ -1915,7 +1919,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                         normalize_input,
                                         input_id_images_path_c_str,
                                         {},
-                                        masked_latent);
+                                        concat_latent,
+                                        denoise_mask);

     size_t t2 = ggml_time_ms();