
Commit b8888fc

Kontext refactor
1 parent 6bceb65 commit b8888fc

6 files changed: +173 / -86 lines


diffusion_model.hpp

Lines changed: 23 additions & 23 deletions
@@ -13,13 +13,13 @@ struct DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 bool kontext_concat = false,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) = 0;
+                 int num_video_frames = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength = 0.f,
+                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                 struct ggml_tensor** output = NULL,
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) = 0;
     virtual void alloc_params_buffer() = 0;
     virtual void free_params_buffer() = 0;
     virtual void free_compute_buffer() = 0;
@@ -69,13 +69,13 @@ struct UNetModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 bool kontext_concat = false,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
+                 int num_video_frames = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength = 0.f,
+                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                 struct ggml_tensor** output = NULL,
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
         (void)skip_layers; // SLG doesn't work with UNet models
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }
@@ -123,7 +123,7 @@ struct MMDiTModel : public DiffusionModel {
                  int num_video_frames = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
-                 bool kontext_concat = false,
+                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
                  struct ggml_tensor** output = NULL,
                  struct ggml_context* output_ctx = NULL,
                  std::vector<int> skip_layers = std::vector<int>()) {
@@ -172,14 +172,14 @@ struct FluxModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 bool kontext_concat = false,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, kontext_concat, output, output_ctx, skip_layers);
+                 int num_video_frames = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength = 0.f,
+                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                 struct ggml_tensor** output = NULL,
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, kontext_imgs, output, output_ctx, skip_layers);
     }
 };
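
The change above replaces the single bool kontext_concat flag with an explicit list of reference latents, added as a defaulted parameter so call sites that omit it keep compiling. A minimal standalone sketch of that defaulted-parameter pattern; ggml_tensor is stubbed out and MockDiffusionModel is a made-up stand-in, not the real DiffusionModel interface:

// Illustrative only: stubbed types and a trimmed parameter list.
#include <cstdio>
#include <vector>

struct ggml_tensor {};  // stub; the real type comes from ggml

struct MockDiffusionModel {
    // Trailing parameters mirror the new signature: kontext_imgs defaults to an
    // empty vector, just as the removed kontext_concat flag defaulted to false.
    void compute(ggml_tensor* x,
                 std::vector<ggml_tensor*> controls = {},
                 float control_strength = 0.f,
                 std::vector<ggml_tensor*> kontext_imgs = std::vector<ggml_tensor*>()) {
        (void)x;
        (void)control_strength;
        std::printf("controls=%zu kontext_imgs=%zu\n", controls.size(), kontext_imgs.size());
    }
};

int main() {
    ggml_tensor latent, ref1, ref2;
    MockDiffusionModel model;
    model.compute(&latent);                           // pre-refactor style call still compiles
    model.compute(&latent, {}, 0.f, {&ref1, &ref2});  // passes two reference latents
    return 0;
}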

examples/cli/main.cpp

Lines changed: 50 additions & 9 deletions
@@ -97,6 +97,8 @@ struct SDParams {
     std::string mask_path;
     std::string control_image_path;
 
+    std::vector<std::string> kontext_image_paths;
+
     std::string prompt;
     std::string negative_prompt;
     float min_cfg = 1.0f;
@@ -289,6 +291,7 @@ void print_usage(int argc, const char* argv[]) {
     printf(" --preview-path [PATH} path to write preview image to (default: ./preview.png)\n");
     printf(" --color Colors the logging tags according to level\n");
     printf(" -v, --verbose print extra info\n");
+    printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
 }
 
 void parse_args(int argc, const char** argv, SDParams& params) {
@@ -724,6 +727,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.imatrix_in.push_back(std::string(argv[i]));
+        } else if (arg == "-ki" || arg == "--kontext-img") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.kontext_image_paths.push_back(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -958,12 +967,10 @@ int main(int argc, const char* argv[]) {
             params.skip_layer_end,
             params.slg_scale,
         },
-        {
-            params.apg_eta,
-            params.apg_momentum,
-            params.apg_norm_threshold,
-            params.apg_norm_smoothing
-        }};
+        {params.apg_eta,
+         params.apg_momentum,
+         params.apg_norm_threshold,
+         params.apg_norm_smoothing}};
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
     sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
@@ -1007,8 +1014,40 @@ int main(int argc, const char* argv[]) {
         fprintf(stderr, "SVD support is broken, do not use it!!!\n");
         return 1;
     }
-
     bool vae_decode_only = true;
+
+    std::vector<sd_image_t> kontext_imgs;
+    for (auto& path : params.kontext_image_paths) {
+        vae_decode_only = false;
+        int c = 0;
+        int width = 0;
+        int height = 0;
+        uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
+        if (image_buffer == NULL) {
+            fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+            return 1;
+        }
+        if (c < 3) {
+            fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
+            free(image_buffer);
+            return 1;
+        }
+        if (width <= 0) {
+            fprintf(stderr, "error: the width of image must be greater than 0\n");
+            free(image_buffer);
+            return 1;
+        }
+        if (height <= 0) {
+            fprintf(stderr, "error: the height of image must be greater than 0\n");
+            free(image_buffer);
+            return 1;
+        }
+        kontext_imgs.push_back({(uint32_t)width,
+                                (uint32_t)height,
+                                3,
+                                image_buffer});
+    }
+
     uint8_t* input_image_buffer = NULL;
     uint8_t* control_image_buffer = NULL;
     uint8_t* mask_image_buffer = NULL;
@@ -1148,7 +1187,8 @@ int main(int argc, const char* argv[]) {
                               params.control_strength,
                               params.style_ratio,
                               params.normalize_input,
-                              params.input_id_images_path.c_str());
+                              params.input_id_images_path.c_str(),
+                              kontext_imgs.data(), kontext_imgs.size());
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1210,7 +1250,8 @@ int main(int argc, const char* argv[]) {
                               params.control_strength,
                               params.style_ratio,
                               params.normalize_input,
-                              params.input_id_images_path.c_str());
+                              params.input_id_images_path.c_str(),
+                              kontext_imgs.data(), kontext_imgs.size());
         }
     }
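
The flag is repeatable: each -ki / --kontext-img occurrence appends one path to kontext_image_paths, every image is then decoded with stbi_load, and the resulting sd_image_t list is handed to the generation call as kontext_imgs.data() plus kontext_imgs.size(). A stripped-down, standalone sketch of just the repeatable-flag handling; the Params struct and main below are illustrative, not the real SDParams or CLI:

// Illustrative only: everything except the -ki handling is omitted.
#include <cstdio>
#include <string>
#include <vector>

struct Params {
    std::vector<std::string> kontext_image_paths;  // mirrors SDParams::kontext_image_paths
};

int main(int argc, const char** argv) {
    Params params;
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-ki" || arg == "--kontext-img") {
            if (++i >= argc) {  // flag given without a following path
                std::fprintf(stderr, "error: missing argument for %s\n", arg.c_str());
                return 1;
            }
            // The flag may appear multiple times; every occurrence adds one reference image.
            params.kontext_image_paths.push_back(argv[i]);
        }
    }
    for (const auto& path : params.kontext_image_paths) {
        std::printf("kontext image: %s\n", path.c_str());
    }
    return 0;
}

Run as, for example, ./kontext_args -ki ref1.png -ki ref2.png, this prints both paths in the order given, which is the order in which the CLI loads and forwards the reference images.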

flux.hpp

Lines changed: 32 additions & 26 deletions
@@ -930,14 +930,13 @@ namespace Flux {
         }
 
         struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    struct ggml_tensor* x,
+                                    std::vector<struct ggml_tensor*> imgs,
                                     struct ggml_tensor* timestep,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* c_concat,
                                     struct ggml_tensor* y,
                                     struct ggml_tensor* guidance,
                                     struct ggml_tensor* pe,
-                                    bool kontext_concat = false,
                                     struct ggml_tensor* arange = NULL,
                                     std::vector<int> skip_layers = std::vector<int>(),
                                     SDVersion version = VERSION_FLUX) {
@@ -951,19 +950,31 @@ namespace Flux {
             // pe: (L, d_head/2, 2, 2)
             // return: (N, C, H, W)
 
+            auto x = imgs[0];
             GGML_ASSERT(x->ne[3] == 1);
 
             int64_t W = x->ne[0];
             int64_t H = x->ne[1];
             int64_t C = x->ne[2];
             int64_t patch_size = 2;
-            int pad_h = (patch_size - H % patch_size) % patch_size;
-            int pad_w = (patch_size - W % patch_size) % patch_size;
-            x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
+            int pad_h = (patch_size - x->ne[0] % patch_size) % patch_size;
+            int pad_w = (patch_size - x->ne[1] % patch_size) % patch_size;
 
             // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-            auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size]
-            int64_t patchified_img_size = img->ne[1];
+            ggml_tensor* img = NULL; // [N, h*w, C * patch_size * patch_size]
+            int64_t patchified_img_size;
+            for (auto& x : imgs) {
+                int pad_h = (patch_size - x->ne[0] % patch_size) % patch_size;
+                int pad_w = (patch_size - x->ne[1] % patch_size) % patch_size;
+                ggml_tensor* pad_x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);
+                pad_x = patchify(ctx, pad_x, patch_size);
+                if (img) {
+                    img = ggml_concat(ctx, img, pad_x, 1);
+                } else {
+                    img = pad_x;
+                    patchified_img_size = img->ne[1];
+                }
+            }
             if (version == VERSION_FLUX_FILL) {
                 GGML_ASSERT(c_concat != NULL);
                 ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
@@ -999,10 +1010,6 @@ namespace Flux {
                 control = patchify(ctx, control, patch_size);
 
                 img = ggml_concat(ctx, img, control, 0);
-            } else if (kontext_concat && c_concat != NULL) {
-                ggml_tensor* kontext = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0);
-                kontext = patchify(ctx, kontext, patch_size);
-                img = ggml_concat(ctx, img, kontext, 1);
             }
 
             auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, h*w, C * patch_size * patch_size]
@@ -1097,8 +1104,8 @@ namespace Flux {
                      struct ggml_tensor* c_concat,
                      struct ggml_tensor* y,
                      struct ggml_tensor* guidance,
-                     bool kontext_concat = false,
-                     std::vector<int> skip_layers = std::vector<int>()) {
+                     std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                     std::vector<int> skip_layers = std::vector<int>()) {
         GGML_ASSERT(x->ne[3] == 1);
         struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
 
@@ -1109,6 +1116,9 @@ namespace Flux {
         if (c_concat != NULL) {
             c_concat = to_backend(c_concat);
         }
+        for (auto &img : kontext_imgs){
+            img = to_backend(img);
+        }
         if (flux_params.is_chroma) {
             const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
             bool disable_guidance = true;
@@ -1148,11 +1158,8 @@ namespace Flux {
         if (flux_params.guidance_embed || flux_params.is_chroma) {
             guidance = to_backend(guidance);
         }
-
-        std::vector<struct ggml_tensor*> imgs{x};
-        if (kontext_concat && c_concat != NULL) {
-            imgs.push_back(c_concat);
-        }
+        auto imgs = kontext_imgs;
+        imgs.insert(imgs.begin(), x);
 
         pe_vec = flux.gen_pe(imgs, context, 2, flux_params.theta, flux_params.axes_dim);
         int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
@@ -1175,14 +1182,13 @@ namespace Flux {
         // }
 
         struct ggml_tensor* out = flux.forward(compute_ctx,
-                                               x,
+                                               imgs,
                                                timesteps,
                                                context,
                                                c_concat,
                                                y,
                                                guidance,
                                                pe,
-                                               kontext_concat,
                                                precompute_arange,
                                                skip_layers,
                                                version);
@@ -1199,17 +1205,17 @@ namespace Flux {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 bool kontext_concat = false,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
+                 std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                 struct ggml_tensor** output = NULL,
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
         // x: [N, in_channels, h, w]
         // timesteps: [N, ]
         // context: [N, max_position, hidden_size]
         // y: [N, adm_in_channels] or [1, adm_in_channels]
         // guidance: [N, ]
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(x, timesteps, context, c_concat, y, guidance, kontext_concat, skip_layers);
+            return build_graph(x, timesteps, context, c_concat, y, guidance, kontext_imgs, skip_layers);
         };
 
         return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@@ -1249,7 +1255,7 @@ namespace Flux {
         struct ggml_tensor* out = NULL;
 
         int t0 = ggml_time_ms();
-        compute(8, x, timesteps, context, NULL, y, guidance, false, &out, work_ctx);
+        compute(8, x, timesteps, context, NULL, y, guidance, std::vector<struct ggml_tensor*>(), &out, work_ctx);
         int t1 = ggml_time_ms();
 
         LOG_DEBUG("flux test done in %dms", t1 - t0);
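
With this refactor, every entry of imgs (the noisy latent first, then each kontext reference latent) is padded to a multiple of patch_size = 2, patchified into tokens of C * patch_size * patch_size values, and concatenated along the token dimension, and gen_pe is called on the whole list so the reference tokens receive their own positional embeddings. patchified_img_size is recorded only for the first image, presumably so the later unpatchify step keeps just the main latent's tokens in the output. A back-of-the-envelope sketch of the resulting sequence-length arithmetic; the latent shapes in main() are made-up examples, not values from the commit:

// Plain C++ (no ggml): counts the tokens contributed by each image after
// padding to a multiple of patch_size and 2x2 patchification.
#include <cstdint>
#include <cstdio>
#include <vector>

struct LatentShape {
    int64_t W, H, C;
};

int64_t token_count(const LatentShape& s, int64_t patch_size = 2) {
    int64_t pad_w = (patch_size - s.W % patch_size) % patch_size;
    int64_t pad_h = (patch_size - s.H % patch_size) % patch_size;
    return ((s.W + pad_w) / patch_size) * ((s.H + pad_h) / patch_size);
}

int main() {
    // imgs[0] is the noisy latent; the rest are kontext reference latents (hypothetical sizes).
    std::vector<LatentShape> imgs = {{128, 128, 16}, {128, 128, 16}, {96, 64, 16}};
    int64_t total = 0;
    for (size_t i = 0; i < imgs.size(); i++) {
        int64_t n = token_count(imgs[i]);
        total += n;
        std::printf("img %zu: %lld tokens, each %lld values wide\n",
                    i, (long long)n, (long long)(imgs[i].C * 2 * 2));
    }
    std::printf("concatenated sequence length: %lld tokens\n", (long long)total);
    std::printf("patchified_img_size (first image only): %lld tokens\n",
                (long long)token_count(imgs[0]));
    return 0;
}

Since each reference latent of comparable size adds roughly as many tokens as the main latent, extra kontext images increase the cost of the attention layers, not just the conditioning inputs.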

ggml

Submodule ggml updated from 988abe2 to 9e4bee1
