@@ -810,8 +810,7 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        ggml_tensor* noise_mask = nullptr) {
-
+                        ggml_tensor* denoise_mask = NULL) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1026,10 +1025,10 @@ class StableDiffusionGGML {
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
-            if (noise_mask != nullptr) {
+            if (denoise_mask != nullptr) {
                 for (int64_t x = 0; x < denoised->ne[0]; x++) {
                     for (int64_t y = 0; y < denoised->ne[1]; y++) {
-                        float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                        float mask = ggml_tensor_get_f32(denoise_mask, x, y);
                         for (int64_t k = 0; k < denoised->ne[2]; k++) {
                             float init = ggml_tensor_get_f32(init_latent, x, y, k);
                             float den  = ggml_tensor_get_f32(denoised, x, y, k);
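The hunk above ends mid-loop. For context, here is a minimal sketch of how a per-pixel denoise mask is typically applied to the sampler output; it assumes the repo's ggml_tensor_get_f32/ggml_tensor_set_f32 helpers and a simple linear blend between the initial latent and the freshly denoised latent (the exact expression in the full source may differ):

    // Sketch (assumed blend): mask == 0 keeps the initial latent, mask == 1 takes the denoised value.
    for (int64_t x = 0; x < denoised->ne[0]; x++) {
        for (int64_t y = 0; y < denoised->ne[1]; y++) {
            float mask = ggml_tensor_get_f32(denoise_mask, x, y);
            for (int64_t k = 0; k < denoised->ne[2]; k++) {
                float init = ggml_tensor_get_f32(init_latent, x, y, k);
                float den  = ggml_tensor_get_f32(denoised, x, y, k);
                // init * (1 - mask) + den * mask
                ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k);
            }
        }
    }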
@@ -1283,7 +1282,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           ggml_tensor* masked_latent = NULL) {
+                           ggml_tensor* concat_latent = NULL,
+                           ggml_tensor* denoise_mask = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1470,7 +1470,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    ggml_tensor* noise_mask = nullptr;
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         int64_t mask_channels = 1;
         if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
@@ -1496,21 +1495,22 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                 }
             }
         }
-        if (masked_latent == NULL) {
-            masked_latent = empty_latent;
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
         }
-        cond.c_concat = masked_latent;
+        cond.c_concat = concat_latent;
         uncond.c_concat = empty_latent;
-        // noise_mask = masked_latent;
+        denoise_mask = NULL;
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        cond.c_concat = masked_latent;
-        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]);
+        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
         ggml_set_f32(empty_latent, 0);
         uncond.c_concat = empty_latent;
-    } else {
-        noise_mask = masked_latent;
-    }
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
+        }
+        cond.c_concat = concat_latent;
 
+    }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed = seed + b;
@@ -1545,7 +1545,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      sigmas,
                                                      start_merge_step,
                                                      id_cond,
-                                                     noise_mask);
+                                                     denoise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1756,7 +1756,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
     sd_image_to_tensor(init_image.data, init_img);
 
-    ggml_tensor* masked_latent;
+    ggml_tensor* concat_latent;
+    ggml_tensor* denoise_mask = NULL;
 
     ggml_tensor* init_latent = NULL;
     ggml_tensor* init_moments = NULL;
@@ -1776,63 +1777,65 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
         sd_image_to_tensor(init_image.data, init_img);
         sd_apply_mask(init_img, mask_img, masked_img);
-        ggml_tensor* masked_latent_0 = NULL;
+        ggml_tensor* masked_latent = NULL;
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_latent_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
         } else {
-            masked_latent_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent_0->ne[0], masked_latent_0->ne[1], mask_channels + masked_latent_0->ne[2], 1);
-        for (int ix = 0; ix < masked_latent_0->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent_0->ne[1]; iy++) {
+        concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
+        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
+            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
                 int mx = ix * 8;
                 int my = iy * 8;
                 if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
                     }
                     // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
                     for (int x = 0; x < 8; x++) {
                         for (int y = 0; y < 8; y++) {
                             float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
                             // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
                             // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                            ggml_tensor_set_f32(masked_latent, m, ix, iy, masked_latent_0->ne[2] + x * 8 + y);
+                            ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
                         }
                     }
                 } else {
                     float m = ggml_tensor_get_f32(mask_img, mx, my);
-                    ggml_tensor_set_f32(masked_latent, m, ix, iy, 0);
-                    for (int k = 0; k < masked_latent_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_latent, v, ix, iy, k + mask_channels);
+                    ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                     }
                 }
             }
         }
     } else if (sd_version_is_edit(sd_ctx->sd->version)) {
-        // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way
+        // Not actually masked, we're just highjacking the concat_latent variable since it will be used the same way
        if (!sd_ctx->sd->use_tiny_autoencoder) {
            if (sd_ctx->sd->is_using_edm_v_parameterization) {
                // for CosXL edit
-                concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, init_moments);
            } else {
-                masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
+                concat_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
            }
        } else {
-            masked_latent = init_latent;
+            concat_latent = init_latent;
        }
-    } else {
+    }
+
+    {
        // LOG_WARN("Inpainting with a base model is not great");
-        masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
-        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
+        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+        for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
+            for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
                int mx = ix * 8;
                int my = iy * 8;
                float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(masked_latent, m, ix, iy);
+                ggml_tensor_set_f32(denoise_mask, m, ix, iy);
            }
        }
    }
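For readers following the indexing in that hunk, here is a small hypothetical helper (not part of the patch) that summarizes the channel layout the loops write into concat_latent; it assumes mask_channels is 64 (an 8x8 block) for VERSION_FLUX_FILL and 1 for other inpaint models, which is what the indexing above relies on.

    // Hypothetical illustration of the concat_latent channel layout built above (C = masked-image latent channels).
    // FLUX_FILL : channels [0 .. C-1]  = masked-image latent,
    //             channels [C .. C+63] = the 8x8 mask block flattened as x * 8 + y.
    // otherwise : channel  [0]         = the (1/8-resolution) mask sample,
    //             channels [1 .. C]    = masked-image latent.
    static int concat_mask_channel(bool flux_fill, int latent_channels, int x_in_block, int y_in_block) {
        return flux_fill ? latent_channels + x_in_block * 8 + y_in_block : 0;
    }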
@@ -1868,7 +1871,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                 style_ratio,
                                                 normalize_input,
                                                 input_id_images_path_c_str,
-                                                masked_latent);
+                                                concat_latent,
+                                                denoise_mask);
 
     size_t t2 = ggml_time_ms();
 