From 769d0ab2e9dbbf8aa23eb0f9c5b11257daf9902d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 23 Mar 2025 01:12:39 +0100 Subject: [PATCH 01/15] Imatrix: first implementation attempt --- clip.hpp | 2 + examples/CMakeLists.txt | 3 +- examples/cli/main.cpp | 2 +- examples/imatrix/CMakeLists.txt | 7 + examples/imatrix/imatrix.cpp | 1089 +++++++++++++++++++++++++++++++ ggml_extend.hpp | 39 +- imatrix.hpp | 325 +++++++++ model.cpp | 93 ++- model.h | 4 +- stable-diffusion.h | 4 +- util.cpp | 17 + util.h | 4 + 12 files changed, 1573 insertions(+), 16 deletions(-) create mode 100644 examples/imatrix/CMakeLists.txt create mode 100644 examples/imatrix/imatrix.cpp create mode 100644 imatrix.hpp diff --git a/clip.hpp b/clip.hpp index 2307ee3c5..059b7d0d8 100644 --- a/clip.hpp +++ b/clip.hpp @@ -661,6 +661,7 @@ class CLIPTextModel : public GGMLBlock { if (version == OPEN_CLIP_VIT_BIGG_14) { enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? 
tensor_types[prefix + "text_projection"] : GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); + ggml_set_name(params["text_projection"], (prefix + "text_projection").c_str()); } } @@ -812,6 +813,7 @@ class CLIPProjection : public UnaryBlock { } else { params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features); } + ggml_set_name(params["weight"], (prefix + "weight").c_str()); } public: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 81053f9e2..d1806c7ae 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,3 +1,4 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_subdirectory(cli) \ No newline at end of file +add_subdirectory(cli) +add_subdirectory(imatrix) \ No newline at end of file diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbdb..5cbd0320c 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -800,7 +800,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype,NULL); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt new file mode 100644 index 000000000..4c5e46939 --- /dev/null +++ b/examples/imatrix/CMakeLists.txt @@ -0,0 +1,7 @@ +set(TARGET sd-imatrix) + +add_executable(${TARGET} imatrix.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PUBLIC cxx_std_11) \ No newline at end of file diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp new file mode 100644 
index 000000000..a974bb1d7 --- /dev/null +++ b/examples/imatrix/imatrix.cpp @@ -0,0 +1,1089 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "stable-diffusion.h" + +#define STB_IMAGE_IMPLEMENTATION +#define STB_IMAGE_STATIC +#include "stb_image.h" + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#define STB_IMAGE_WRITE_STATIC +#include "stb_image_write.h" + +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#define STB_IMAGE_RESIZE_STATIC +#include "stb_image_resize.h" + +const char* rng_type_to_str[] = { + "std_default", + "cuda", +}; + +// Names of the sampler method, same order as enum sample_method in stable-diffusion.h +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "ipndm", + "ipndm_v", + "lcm", + "ddim_trailing", + "tcd", +}; + +// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h +const char* schedule_str[] = { + "default", + "discrete", + "karras", + "exponential", + "ays", + "gits", +}; + +const char* modes_str[] = { + "txt2img", + "img2img", + "img2vid", + "convert", +}; + +const char* previews_str[] = { + "none", + "proj", + "tae", + "vae", +}; + +enum SDMode { + TXT2IMG, + IMG2IMG, + IMG2VID, + CONVERT, + MODE_COUNT +}; + +struct SDParams { + int n_threads = -1; + SDMode mode = TXT2IMG; + std::string model_path; + std::string clip_l_path; + std::string clip_g_path; + std::string t5xxl_path; + std::string diffusion_model_path; + std::string vae_path; + std::string taesd_path; + std::string esrgan_path; + std::string controlnet_path; + std::string embeddings_path; + std::string stacked_id_embeddings_path; + std::string input_id_images_path; + sd_type_t wtype = SD_TYPE_COUNT; + std::string lora_model_dir; + std::string output_path = "output.png"; + std::string input_path; + std::string mask_path; + std::string control_image_path; + + std::string prompt; + std::string negative_prompt; + float min_cfg = 1.0f; 
+ float cfg_scale = 7.0f; + float guidance = 3.5f; + float eta = 0.f; + float style_ratio = 20.f; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; + int batch_count = 1; + + int video_frames = 6; + int motion_bucket_id = 127; + int fps = 6; + float augmentation_level = 0.f; + + sample_method_t sample_method = EULER_A; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + float control_strength = 0.9f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; + bool control_net_cpu = false; + bool normalize_input = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool canny_preprocess = false; + bool color = false; + int upscale_repeats = 1; + + std::vector skip_layers = {7, 8, 9}; + float slg_scale = 0.0f; + float skip_layer_start = 0.01f; + float skip_layer_end = 0.2f; + + /* Imatrix params */ + + bool process_output = false; + int n_out_freq = 0; + int n_save_freq = 0; + + std::string out_file = "imatrix.dat"; + + std::vector in_files = {}; +}; + +#include "imatrix.hpp" + +static IMatrixCollector g_collector; + +/* Enables Printing the log level tag in color using ANSI escape codes */ +void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { + SDParams* params = (SDParams*)data; + int tag_color; + const char* level_str; + FILE* out_stream = (level == SD_LOG_ERROR) ? 
stderr : stdout; + + if (!log || (!params->verbose && level <= SD_LOG_DEBUG)) { + return; + } + + switch (level) { + case SD_LOG_DEBUG: + tag_color = 37; + level_str = "DEBUG"; + break; + case SD_LOG_INFO: + tag_color = 34; + level_str = "INFO"; + break; + case SD_LOG_WARN: + tag_color = 35; + level_str = "WARN"; + break; + case SD_LOG_ERROR: + tag_color = 31; + level_str = "ERROR"; + break; + default: /* Potential future-proofing */ + tag_color = 33; + level_str = "?????"; + break; + } + + if (params->color == true) { + fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str); + } else { + fprintf(out_stream, "[%-5s] ", level_str); + } + fputs(log, out_stream); + fflush(out_stream); +} +void print_params(SDParams params) { + (void)params; +} + +void print_usage(int, const char** argv) { + printf("\nexample usage:\n"); + printf( + "\n %s \\\n" + " {same as sd.exe} [-O imatrix.dat]\\\n" + " [--output-frequency 10] [--save-frequency 0] \\\n" + " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n", + argv[0]); + printf("\n"); +} + +void parse_args(int argc, const char** argv, SDParams& params) { + bool invalid_arg = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-M" || arg == "--mode") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* mode_selected = argv[i]; + int mode_found = -1; + for (int d = 0; d < MODE_COUNT; d++) { + if (!strcmp(mode_selected, modes_str[d])) { + mode_found = d; + } + } + if (mode_found == -1) { + fprintf(stderr, + "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n", + mode_selected); + exit(1); + } + params.mode = (SDMode)mode_found; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.model_path = argv[i]; + } else if 
(arg == "--clip_l") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.clip_l_path = argv[i]; + } else if (arg == "--clip_g") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.clip_g_path = argv[i]; + } else if (arg == "--t5xxl") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.t5xxl_path = argv[i]; + } else if (arg == "--diffusion-model") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.diffusion_model_path = argv[i]; + } else if (arg == "--vae") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.vae_path = argv[i]; + } else if (arg == "--taesd") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.taesd_path = argv[i]; + } else if (arg == "--control-net") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.controlnet_path = argv[i]; + } else if (arg == "--upscale-model") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.esrgan_path = argv[i]; + } else if (arg == "--embd-dir") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.embeddings_path = argv[i]; + } else if (arg == "--stacked-id-embd-dir") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.stacked_id_embeddings_path = argv[i]; + } else if (arg == "--input-id-images-dir") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.input_id_images_path = argv[i]; + } else if (arg == "--type") { + if (++i >= argc) { + invalid_arg = true; + break; + } + std::string type = argv[i]; + bool found = false; + std::string valid_types = ""; + for (size_t i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + std::string name(trait->type_name); + if (name == "f32" || trait->to_float && trait->type_size) { + if (i) + valid_types += ", "; + valid_types += name; + if (type == name) { + if (ggml_quantize_requires_imatrix((ggml_type)i)) { + printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. 
A dummy imatrix will be used, expect poor quality.\n", trait->type_name); + } + params.wtype = (enum sd_type_t)i; + found = true; + break; + } + } + } + if (!found) { + fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", + type.c_str(), + valid_types.c_str()); + exit(1); + } + } else if (arg == "--lora-model-dir") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.lora_model_dir = argv[i]; + } else if (arg == "-i" || arg == "--init-img") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.input_path = argv[i]; + } else if (arg == "--mask") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.mask_path = argv[i]; + } else if (arg == "--control-image") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.control_image_path = argv[i]; + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.output_path = argv[i]; + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.prompt = argv[i]; + } else if (arg == "--upscale-repeats") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.upscale_repeats = std::stoi(argv[i]); + if (params.upscale_repeats < 1) { + fprintf(stderr, "error: upscale multiplier must be at least 1\n"); + exit(1); + } + } else if (arg == "-n" || arg == "--negative-prompt") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.negative_prompt = argv[i]; + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.cfg_scale = std::stof(argv[i]); + } else if (arg == "--guidance") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.guidance = std::stof(argv[i]); + } else if (arg == "--eta") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.eta = std::stof(argv[i]); + } else if (arg == "--strength") { + if (++i >= argc) { + invalid_arg = true; + break; + } + 
params.strength = std::stof(argv[i]); + } else if (arg == "--style-ratio") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.style_ratio = std::stof(argv[i]); + } else if (arg == "--control-strength") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.control_strength = std::stof(argv[i]); + } else if (arg == "-H" || arg == "--height") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.height = std::stoi(argv[i]); + } else if (arg == "-W" || arg == "--width") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.width = std::stoi(argv[i]); + } else if (arg == "--steps") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.sample_steps = std::stoi(argv[i]); + } else if (arg == "--clip-skip") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.clip_skip = std::stoi(argv[i]); + } else if (arg == "--vae-tiling") { + params.vae_tiling = true; + } else if (arg == "--control-net-cpu") { + params.control_net_cpu = true; + } else if (arg == "--normalize-input") { + params.normalize_input = true; + } else if (arg == "--clip-on-cpu") { + params.clip_on_cpu = true; // will slow down get_learned_condiotion but necessary for low MEM GPUs + } else if (arg == "--vae-on-cpu") { + params.vae_on_cpu = true; // will slow down latent decoding but necessary for low MEM GPUs + } else if (arg == "--diffusion-fa") { + params.diffusion_flash_attn = true; // can reduce MEM significantly + } else if (arg == "--canny") { + params.canny_preprocess = true; + } else if (arg == "-b" || arg == "--batch-count") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.batch_count = std::stoi(argv[i]); + } else if (arg == "--rng") { + if (++i >= argc) { + invalid_arg = true; + break; + } + std::string rng_type_str = argv[i]; + if (rng_type_str == "std_default") { + params.rng_type = STD_DEFAULT_RNG; + } else if (rng_type_str == "cuda") { + params.rng_type = CUDA_RNG; + } else { + invalid_arg = true; + 
break; + } + } else if (arg == "--schedule") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* schedule_selected = argv[i]; + int schedule_found = -1; + for (int d = 0; d < N_SCHEDULES; d++) { + if (!strcmp(schedule_selected, schedule_str[d])) { + schedule_found = d; + } + } + if (schedule_found == -1) { + invalid_arg = true; + break; + } + params.schedule = (schedule_t)schedule_found; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.seed = std::stoll(argv[i]); + } else if (arg == "--sampling-method") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* sample_method_selected = argv[i]; + int sample_method_found = -1; + for (int m = 0; m < N_SAMPLE_METHODS; m++) { + if (!strcmp(sample_method_selected, sample_method_str[m])) { + sample_method_found = m; + } + } + if (sample_method_found == -1) { + invalid_arg = true; + break; + } + params.sample_method = (sample_method_t)sample_method_found; + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + } else if (arg == "--color") { + params.color = true; + } else if (arg == "--slg-scale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.slg_scale = std::stof(argv[i]); + } else if (arg == "--skip-layers") { + if (++i >= argc) { + invalid_arg = true; + break; + } + if (argv[i][0] != '[') { + invalid_arg = true; + break; + } + std::string layers_str = argv[i]; + while (layers_str.back() != ']') { + if (++i >= argc) { + invalid_arg = true; + break; + } + layers_str += " " + std::string(argv[i]); + } + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try 
{ + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument& e) { + invalid_arg = true; + break; + } + } + params.skip_layers = layers; + + if (invalid_arg) { + break; + } + } else if (arg == "--skip-layer-start") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.skip_layer_start = std::stof(argv[i]); + } else if (arg == "--skip-layer-end") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "-O") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.out_file = argv[i]; + } else if (arg == "--output-frequency") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.n_out_freq = std::stoi(argv[i]); + } else if (arg == "--save-frequency") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.n_save_freq = std::stoi(argv[i]); + } else if (arg == "--in-file") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.in_files.push_back(std::string(argv[i])); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + } + if (invalid_arg) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + if (params.n_threads <= 0) { + params.n_threads = get_num_physical_cores(); + } + + if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) { + fprintf(stderr, "error: the following arguments are required: prompt\n"); + print_usage(argc, argv); + exit(1); + } + + if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); + print_usage(argc, argv); + exit(1); + } + + if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) { + fprintf(stderr, "error: when using the img2img mode, the following arguments are required: 
init-img\n"); + print_usage(argc, argv); + exit(1); + } + + if (params.output_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: output_path\n"); + print_usage(argc, argv); + exit(1); + } + + if (params.width <= 0 || params.width % 64 != 0) { + fprintf(stderr, "error: the width must be a multiple of 64\n"); + exit(1); + } + + if (params.height <= 0 || params.height % 64 != 0) { + fprintf(stderr, "error: the height must be a multiple of 64\n"); + exit(1); + } + + if (params.sample_steps <= 0) { + fprintf(stderr, "error: the sample_steps must be greater than 0\n"); + exit(1); + } + + if (params.strength < 0.f || params.strength > 1.f) { + fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); + exit(1); + } + + if (params.seed < 0) { + srand((int)time(NULL)); + params.seed = rand(); + } + + if (params.mode == CONVERT) { + if (params.output_path == "output.png") { + params.output_path = "output.gguf"; + } + } +} + +static std::string sd_basename(const std::string& path) { + size_t pos = path.find_last_of('/'); + if (pos != std::string::npos) { + return path.substr(pos + 1); + } + pos = path.find_last_of('\\'); + if (pos != std::string::npos) { + return path.substr(pos + 1); + } + return path; +} + +std::string get_image_params(SDParams params, int64_t seed) { + std::string parameter_string = params.prompt + "\n"; + if (params.negative_prompt.size() != 0) { + parameter_string += "Negative prompt: " + params.negative_prompt + "\n"; + } + parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; + parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; + if (params.slg_scale != 0 && params.skip_layers.size() != 0) { + parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; + parameter_string += "Skip layers: ["; + for (const auto& layer : params.skip_layers) { + parameter_string += std::to_string(layer) + ", "; + } + parameter_string += "], "; + parameter_string 
+= "Skip layer start: " + std::to_string(params.skip_layer_start) + ", "; + parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", "; + } + parameter_string += "Guidance: " + std::to_string(params.guidance) + ", "; + parameter_string += "Eta: " + std::to_string(params.eta) + ", "; + parameter_string += "Seed: " + std::to_string(seed) + ", "; + parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; + parameter_string += "Model: " + sd_basename(params.model_path) + ", "; + parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", "; + parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]); + if (params.schedule == KARRAS) { + parameter_string += " karras"; + } + parameter_string += ", "; + parameter_string += "Version: stable-diffusion.cpp"; + return parameter_string; +} + +const char* preview_path; + +void step_callback(int step, sd_image_t image) { + stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); +} + +static bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + +int main(int argc, const char** argv) { + SDParams params; + + parse_args(argc, argv, params); + + sd_set_log_callback(sd_log_cb, (void*)¶ms); + + if (params.verbose) { + print_params(params); + printf("%s", sd_get_system_info()); + } + + g_collector.set_params(params); + + for (const auto& in_file : params.in_files) { + printf("loading imatrix from '%s'\n", in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + LOG_ERROR("failed to load %s\n", in_file.c_str()); + return 1; + } + } + + sd_set_backend_eval_callback((sd_graph_eval_callback_t)collect_imatrix, ¶ms); + + if (params.mode == CONVERT) { + const char* imatrix_file = NULL; + if (params.in_files.size() > 0) { + imatrix_file = params.in_files[0].c_str(); + } + bool success = 
convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, imatrix_file); + if (!success) { + fprintf(stderr, + "convert '%s'/'%s' to '%s' failed\n", + params.model_path.c_str(), + params.vae_path.c_str(), + params.output_path.c_str()); + return 1; + } else { + printf("convert '%s'/'%s' to '%s' success\n", + params.model_path.c_str(), + params.vae_path.c_str(), + params.output_path.c_str()); + return 0; + } + } + + if (params.mode == IMG2VID) { + fprintf(stderr, "SVD support is broken, do not use it!!!\n"); + return 1; + } + + bool vae_decode_only = true; + uint8_t* input_image_buffer = NULL; + uint8_t* control_image_buffer = NULL; + uint8_t* mask_image_buffer = NULL; + + if (params.mode == IMG2IMG || params.mode == IMG2VID) { + vae_decode_only = false; + + int c = 0; + int width = 0; + int height = 0; + input_image_buffer = stbi_load(params.input_path.c_str(), &width, &height, &c, 3); + if (input_image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); + return 1; + } + if (c < 3) { + fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); + free(input_image_buffer); + return 1; + } + if (width <= 0) { + fprintf(stderr, "error: the width of image must be greater than 0\n"); + free(input_image_buffer); + return 1; + } + if (height <= 0) { + fprintf(stderr, "error: the height of image must be greater than 0\n"); + free(input_image_buffer); + return 1; + } + + // Resize input image ... 
+ if (params.height != height || params.width != width) { + printf("resize input image from %dx%d to %dx%d\n", width, height, params.width, params.height); + int resized_height = params.height; + int resized_width = params.width; + + uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * 3); + if (resized_image_buffer == NULL) { + fprintf(stderr, "error: allocate memory for resize input image\n"); + free(input_image_buffer); + return 1; + } + stbir_resize(input_image_buffer, width, height, 0, + resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, + 3 /*RGB channel*/, STBIR_ALPHA_CHANNEL_NONE, 0, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, + STBIR_FILTER_BOX, STBIR_FILTER_BOX, + STBIR_COLORSPACE_SRGB, nullptr); + + // Save resized result + free(input_image_buffer); + input_image_buffer = resized_image_buffer; + } + } + + sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), + params.clip_l_path.c_str(), + params.clip_g_path.c_str(), + params.t5xxl_path.c_str(), + params.diffusion_model_path.c_str(), + params.vae_path.c_str(), + params.taesd_path.c_str(), + params.controlnet_path.c_str(), + params.lora_model_dir.c_str(), + params.embeddings_path.c_str(), + params.stacked_id_embeddings_path.c_str(), + vae_decode_only, + params.vae_tiling, + true, + params.n_threads, + params.wtype, + params.rng_type, + params.schedule, + params.clip_on_cpu, + params.control_net_cpu, + params.vae_on_cpu, + params.diffusion_flash_attn); + + if (sd_ctx == NULL) { + printf("new_sd_ctx_t failed\n"); + return 1; + } + + sd_image_t* control_image = NULL; + if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { + int c = 0; + control_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); + if (control_image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); + return 1; + } + control_image = new sd_image_t{(uint32_t)params.width, + 
(uint32_t)params.height, + 3, + control_image_buffer}; + if (params.canny_preprocess) { // apply preprocessor + control_image->data = preprocess_canny(control_image->data, + control_image->width, + control_image->height, + 0.08f, + 0.08f, + 0.8f, + 1.0f, + false); + } + } + + std::vector default_mask_image_vec(params.width * params.height, 255); + if (params.mask_path != "") { + int c = 0; + mask_image_buffer = stbi_load(params.mask_path.c_str(), ¶ms.width, ¶ms.height, &c, 1); + } else { + mask_image_buffer = default_mask_image_vec.data(); + } + sd_image_t mask_image = {(uint32_t)params.width, + (uint32_t)params.height, + 1, + mask_image_buffer}; + + sd_image_t* results; + if (params.mode == TXT2IMG) { + results = txt2img(sd_ctx, + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + params.cfg_scale, + params.guidance, + params.eta, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.seed, + params.batch_count, + control_image, + params.control_strength, + params.style_ratio, + params.normalize_input, + params.input_id_images_path.c_str(), + params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end); + } else { + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, + 3, + input_image_buffer}; + + if (params.mode == IMG2VID) { + results = img2vid(sd_ctx, + input_image, + params.width, + params.height, + params.video_frames, + params.motion_bucket_id, + params.fps, + params.augmentation_level, + params.min_cfg, + params.cfg_scale, + params.sample_method, + params.sample_steps, + params.strength, + params.seed); + if (results == NULL) { + printf("generate failed\n"); + free_sd_ctx(sd_ctx); + return 1; + } + size_t last = params.output_path.find_last_of("."); + std::string dummy_name = last != std::string::npos ? 
params.output_path.substr(0, last) : params.output_path; + for (int i = 0; i < params.video_frames; i++) { + if (results[i].data == NULL) { + continue; + } + std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; + stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, + results[i].data, 0, get_image_params(params, params.seed + i).c_str()); + printf("save result image to '%s'\n", final_image_path.c_str()); + free(results[i].data); + results[i].data = NULL; + } + free(results); + free_sd_ctx(sd_ctx); + return 0; + } else { + results = img2img(sd_ctx, + input_image, + mask_image, + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + params.cfg_scale, + params.guidance, + params.eta, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.strength, + params.seed, + params.batch_count, + control_image, + params.control_strength, + params.style_ratio, + params.normalize_input, + params.input_id_images_path.c_str(), + params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end); + } + } + + if (results == NULL) { + printf("generate failed\n"); + free_sd_ctx(sd_ctx); + return 1; + } + + int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth + if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + params.n_threads); + + if (upscaler_ctx == NULL) { + printf("new_upscaler_ctx failed\n"); + } else { + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + sd_image_t current_image = results[i]; + for (int u = 0; u < params.upscale_repeats; ++u) { + sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); + if (upscaled_image.data == NULL) { + printf("upscale failed\n"); + break; + } 
+ free(current_image.data); + current_image = upscaled_image; + } + results[i] = current_image; // Set the final upscaled image as the result + } + } + } + + std::string dummy_name, ext, lc_ext; + bool is_jpg; + size_t last = params.output_path.find_last_of("."); + size_t last_path = std::min(params.output_path.find_last_of("/"), + params.output_path.find_last_of("\\")); + if (last != std::string::npos // filename has extension + && (last_path == std::string::npos || last > last_path)) { + dummy_name = params.output_path.substr(0, last); + ext = lc_ext = params.output_path.substr(last); + std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); + is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe"; + } else { + dummy_name = params.output_path; + ext = lc_ext = ""; + is_jpg = false; + } + // appending ".png" to absent or unknown extension + if (!is_jpg && lc_ext != ".png") { + dummy_name += ext; + ext = ".png"; + } + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + std::string final_image_path = i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; + if (is_jpg) { + stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, + results[i].data, 90, get_image_params(params, params.seed + i).c_str()); + printf("save result JPEG image to '%s'\n", final_image_path.c_str()); + } else { + stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, + results[i].data, 0, get_image_params(params, params.seed + i).c_str()); + printf("save result PNG image to '%s'\n", final_image_path.c_str()); + } + free(results[i].data); + results[i].data = NULL; + } + g_collector.save_imatrix(); + free(results); + free_sd_ctx(sd_ctx); + free(control_image_buffer); + free(input_image_buffer); + + return 0; +} diff --git a/ggml_extend.hpp b/ggml_extend.hpp index c5913be4d..115f0555a 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -23,9 +23,11 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-cpu.h" +#include "ggml/src/ggml-impl.h" #include "ggml.h" #include "model.h" +#include "util.h" #ifdef SD_USE_CUDA #include "ggml-cuda.h" @@ -1241,7 +1243,39 @@ struct GGMLRunner { ggml_backend_cpu_set_n_threads(backend, n_threads); } - ggml_backend_graph_compute(backend, gf); + auto callback_eval = get_callback_eval(); + + if(!callback_eval){ + ggml_backend_graph_compute(backend, gf); + }else{ + void * callback_eval_user_data = get_callback_eval_user_data(); + for (int j0 = 0; j0 < gf->n_nodes; j0++) { + struct ggml_tensor * t = gf->nodes[j0]; + + // check if the user needs data from this node + bool need = callback_eval(t, true, callback_eval_user_data); + + int j1 = j0; + + // determine the range [j0, j1] of nodes that can be computed together + while (!need && j1 < gf->n_nodes - 1) { + t = gf->nodes[++j1]; + need = callback_eval(t, true, callback_eval_user_data); + } + + struct ggml_cgraph gv = ggml_graph_view(gf, j0, j1 + 1); + + ggml_backend_graph_compute_async(backend, &gv); + + 
if (need && !callback_eval(t, false, callback_eval_user_data)) { + break; + } + + j0 = j1; + } + ggml_backend_synchronize(backend); + } + #ifdef GGML_PERF ggml_graph_print(gf); #endif @@ -1345,6 +1379,7 @@ class Linear : public UnaryBlock { wtype = GGML_TYPE_F32; } params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features); + ggml_set_name(params["weight"], (prefix + "weight").c_str()); if (bias) { enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32; params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features); @@ -1508,6 +1543,8 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); + ggml_set_name(params["weight"], (prefix + "weight").c_str()); + if (bias) { enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; params["bias"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); diff --git a/imatrix.hpp b/imatrix.hpp new file mode 100644 index 000000000..946814a51 --- /dev/null +++ b/imatrix.hpp @@ -0,0 +1,325 @@ +#include "ggml-backend.h" +#include "ggml.h" +#include "util.h" + +#include +#include +#include + +/*Stolen from llama.cpp (credits: Kawrakow)*/ + +struct Stats { + std::vector values{}; + std::vector counts{}; + int ncall = 0; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_params(SDParams params) { m_params = std::move(params); } + bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data); + void save_imatrix(int ncall = -1) const; + bool load_imatrix(const char* fname); + +private: + std::unordered_map m_stats = {}; + SDParams m_params; + std::mutex m_mutex; + int m_last_call = 0; + std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id +}; + +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char* name) { + std::string wname; + const char* p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char* q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { + GGML_UNUSED(user_data); + const struct ggml_tensor* src0 = t->src[0]; + const struct ggml_tensor* src1 = t->src[1]; + std::string wname = filter_tensor_name(src0->name); + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) + return true; // collect all indirect matrix multiplications + if (t->op 
!= GGML_OP_MUL_MAT) + return false; + // why are small batches ignored (<16 tokens)? + // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model.")) + return false; + return true; + } + // LOG_DEBUG("%s", wname.c_str()); + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float* data = is_host ? (const float*)src1->data : m_src1_data.data(); + + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 + if (t->op == GGML_OP_MUL_MAT_ID) { + // ids -> [n_experts_used, n_tokens] + // src1 -> [cols, n_expert_used, n_tokens] + const ggml_tensor* ids = t->src[2]; + const int n_as = src0->ne[2]; + const int n_ids = ids->ne[0]; + + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! 
+ + GGML_ASSERT(ids->ne[1] == src1->ne[2]); + + m_ids.resize(ggml_nbytes(ids)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto& e = m_stats[wname]; + + ++e.ncall; + + if (e.values.empty()) { + e.values.resize(src1->ne[0] * n_as, 0); + e.counts.resize(src1->ne[0] * n_as, 0); + } else if (e.values.size() != (size_t)src1->ne[0] * n_as) { + LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as); + exit(1); // GGML_ABORT("fatal error"); + } + LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + // loop over all possible experts, regardless if they are used or not in the batch + for (int ex = 0; ex < n_as; ++ex) { + size_t e_start = ex * src1->ne[0]; + + for (int idx = 0; idx < n_ids; ++idx) { + for (int row = 0; row < (int)src1->ne[2]; ++row) { + const int excur = *(const int32_t*)(m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]); + + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + + if (excur != ex) + continue; + + const int64_t i11 = idx % src1->ne[1]; + const int64_t i12 = row; + const float* x = (const float*)((const char*)data + i11 * src1->nb[1] + i12 * src1->nb[2]); + + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[e_start + j] += x[j] * x[j]; + e.counts[e_start + j]++; + if (!std::isfinite(e.values[e_start + j])) { + LOG_INFO("\n"); + LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); + exit(1); + } + } + } + } + // if (e.ncall > m_last_call) { + // m_last_call = e.ncall; + // if (m_last_call % m_params.n_out_freq == 0) { + // save_imatrix(); + // } + // if (m_params.n_save_freq > 0 && m_last_call % m_params.n_save_freq == 0) { + // save_imatrix(m_last_call); + // } + // } + } + } else { + auto& e = m_stats[wname]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + e.counts.resize(src1->ne[0], 0); + } else if 
(e.values.size() != (size_t)src1->ne[0]) { + LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + exit(1); // GGML_ABORT("fatal error"); + } + + ++e.ncall; + // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float* x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j] * x[j]; + e.counts[j]++; + if (!std::isfinite(e.values[j])) { + LOG_ERROR("%f detected in %s\n", e.values[j], wname.c_str()); + exit(1); + } + } + } + + // if (e.ncall > m_last_call) { + // m_last_call = e.ncall; + // if (m_last_call % m_params.n_out_freq == 0 && m_last_call > 0) { + // save_imatrix(); + // } + // if (m_params.n_save_freq > 0 && m_last_call % m_params.n_save_freq == 0 && m_last_call > 0) { + // save_imatrix(m_last_call); + // } + // } + } + return true; + +} + +void IMatrixCollector::save_imatrix(int ncall) const { + LOG_INFO("SAVING_IMATRIX..."); + auto fname = m_params.out_file; + + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } + // avoid writing imatrix entries that do not have full data + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + int n_entries = 0; + std::vector to_store; + + bool is_first = true; // for printing + for (const auto& kv : m_stats) { + const int n_all = kv.second.counts.size(); + + if (n_all == 0) { + continue; + } + + int n_zeros = 0; + for (const int c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INFO("\n"); + is_first = false; + } + + if (n_zeros == n_all) { + LOG_WARN("entry '%40s' has no data - skipping\n", kv.first.c_str()); + continue; + } + + if (n_zeros > 0) { + LOG_WARN("entry '%40s' has partial data (%.2f%%) - skipping\n", 
kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + continue; + } + + n_entries++; + to_store.push_back(kv.first); + } + + if (to_store.size() < m_stats.size()) { + LOG_WARN("storing only %zu out of %zu entries\n", to_store.size(), m_stats.size()); + } + + std::ofstream out(fname, std::ios::binary); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (const auto& name : to_store) { + const auto& stat = m_stats.at(name); + int len = name.size(); + out.write((const char*)&len, sizeof(len)); + out.write(name.c_str(), len); + out.write((const char*)&stat.ncall, sizeof(stat.ncall)); + int nval = stat.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) { + std::vector tmp(nval); + for (int i = 0; i < nval; i++) { + tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); + } + out.write((const char*)tmp.data(), nval * sizeof(float)); + } + } + + // Write the number of call the matrix was computed with + out.write((const char*)&m_last_call, sizeof(m_last_call)); + + LOG_DEBUG("\n"); + LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str()); +} + +bool IMatrixCollector::load_imatrix(const char* fname) { + std::ifstream in(fname, std::ios::binary); + if (!in) { + LOG_ERROR("failed to open %s\n", fname); + return false; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERROR("no data in file %s\n", fname); + return false; + } + for (int i = 0; i < n_entries; ++i) { + int len; + in.read((char*)&len, sizeof(len)); + std::vector name_as_vec(len + 1); + in.read((char*)name_as_vec.data(), len); + if (in.fail()) { + LOG_ERROR("failed reading name for entry %d from %s\n", i + 1, fname); + return false; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = m_stats[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char*)&nval, sizeof(nval)); + if 
(in.fail() || nval < 1) { + LOG_ERROR("failed reading number of values for entry %d\n", i); + m_stats = {}; + return false; + } + + if (e.values.empty()) { + e.values.resize(nval, 0); + e.counts.resize(nval, 0); + } + + std::vector tmp(nval); + in.read((char*)tmp.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERROR("failed reading data for entry %d\n", i); + m_stats = {}; + return false; + } + + // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. + for (int i = 0; i < nval; i++) { + e.values[i] += tmp[i]; + e.counts[i] += ncall; + } + e.ncall += ncall; + } + return true; +} \ No newline at end of file diff --git a/model.cpp b/model.cpp index 24da39f6d..29d39e300 100644 --- a/model.cpp +++ b/model.cpp @@ -737,7 +737,8 @@ void convert_tensor(void* src, void* dst, ggml_type dst_type, int nrows, - int n_per_row) { + int n_per_row, + std::vector imatrix = {}) { int n = nrows * n_per_row; if (src_type == dst_type) { size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type); @@ -746,7 +747,10 @@ void convert_tensor(void* src, if (dst_type == GGML_TYPE_F16) { ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n); } else { - std::vector imatrix(n_per_row, 1.0f); // dummy importance matrix + // if(imatrix.size() != 0){ + // LOG_INFO("using imatrix"); + // } + imatrix.resize(n_per_row, 1.0f); const float* im = imatrix.data(); ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im); } @@ -776,7 +780,10 @@ void convert_tensor(void* src, if (dst_type == GGML_TYPE_F16) { ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n); } else { - std::vector imatrix(n_per_row, 1.0f); // dummy importance matrix + // if(imatrix.size() != 0){ + // LOG_INFO("using imatrix"); + // } + imatrix.resize(n_per_row, 1.0f); const float* im = imatrix.data(); ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im); } @@ -1707,7 +1714,7 @@ std::vector remove_duplicates(const std::vector& v 
return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend,std::unordered_map> imatrix_data) { std::vector processed_tensor_storages; for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); @@ -1830,8 +1837,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); } + auto processed_name = convert_tensor_name(tensor_storage.name); + // LOG_DEBUG("%s",processed_name.c_str()); + std::vector imatrix = imatrix_data[processed_name]; + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0],imatrix); } } else { read_buffer.resize(tensor_storage.nbytes()); @@ -1853,6 +1864,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); } else { // convert first, then copy to device memory + auto processed_name = convert_tensor_name(tensor_storage.name); + // LOG_DEBUG("%s",processed_name.c_str()); + std::vector imatrix = imatrix_data[processed_name]; + convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, @@ -1917,7 +1932,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb, backend, {}); if (!success) { LOG_ERROR("load tensors from file failed"); return false; @@ -1977,7 +1992,7 @@ bool 
ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, std::unordered_map> imatrix_data) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2015,7 +2030,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb, backend, imatrix_data); ggml_backend_free(backend); LOG_INFO("load tensors done"); LOG_INFO("trying to save tensors to %s", file_path.c_str()); @@ -2051,7 +2066,54 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +static void load_imatrix(const std::string& imatrix_file, std::unordered_map>& imatrix_data) { + std::ifstream in(imatrix_file.c_str(), std::ios::binary); + if (!in) { + LOG_ERROR("%s: failed to open %s\n", imatrix_file.c_str()); + exit(1); + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERROR("%s: no data in file %s\n", imatrix_file.c_str()); + exit(1); + } + for (int i = 0; i < n_entries; ++i) { + int len; + in.read((char*)&len, sizeof(len)); + std::vector name_as_vec(len + 1); + in.read((char*)name_as_vec.data(), len); + if (in.fail()) { + LOG_ERROR("%s: failed reading name for entry %d from %s\n", i + 1, imatrix_file.c_str()); + exit(1); + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = imatrix_data[name]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char*)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { 
+ LOG_ERROR("%s: failed reading number of values for entry %d\n", i); + imatrix_data = {}; + exit(1); + } + e.resize(nval); + in.read((char*)e.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERROR("%s: failed reading data for entry %d\n", i); + imatrix_data = {}; + exit(1); + } + if (ncall > 0) { + for (auto& v : e) + v /= ncall; + } + } +} + +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* imatrix_path = NULL) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2065,6 +2127,17 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + + std::unordered_map> imatrix_data = {}; + + if(imatrix_path){ + load_imatrix(imatrix_path, imatrix_data); + } + + // for (const auto& pair : imatrix_data) { + // LOG_DEBUG("imatrix key : %s", pair.first.c_str()); + // } + + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, imatrix_data); return success; } diff --git a/model.h b/model.h index d7f976533..f8179643c 100644 --- a/model.h +++ b/model.h @@ -216,12 +216,12 @@ class ModelLoader { ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend, std::unordered_map> imatrix_data = {}); bool load_tensors(std::map& tensors, ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, std::unordered_map>); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type 
type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848a..d5a2b2f48 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -114,9 +114,11 @@ enum sd_log_level_t { typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); +typedef bool (*sd_graph_eval_callback_t)(struct ggml_tensor * t, bool ask, void * user_data); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); +SD_API void sd_set_backend_eval_callback(sd_graph_eval_callback_t cb, void * data); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); @@ -228,7 +230,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* imatrix_path); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, diff --git a/util.cpp b/util.cpp index da11a14d6..3c1779546 100644 --- a/util.cpp +++ b/util.cpp @@ -247,6 +247,9 @@ int32_t get_num_physical_cores() { static sd_progress_cb_t sd_progress_cb = NULL; void* sd_progress_cb_data = NULL; +static ggml_graph_eval_callback callback_eval = NULL; +void * callback_eval_user_data = NULL; + std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; return converter.from_bytes(utf8_str); @@ -420,6 +423,20 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } + +void sd_set_backend_eval_callback(ggml_graph_eval_callback cb, void * data){ 
+ callback_eval = cb; + callback_eval_user_data = data; +} + +ggml_graph_eval_callback get_callback_eval(){ + return callback_eval; +} + +void* get_callback_eval_user_data() { + return callback_eval_user_data; +} + const char* sd_get_system_info() { static char buffer[1024]; std::stringstream ss; diff --git a/util.h b/util.h index 14fa812e5..f23cc5def 100644 --- a/util.h +++ b/util.h @@ -7,6 +7,8 @@ #include "stable-diffusion.h" +typedef bool (*ggml_graph_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); + bool ends_with(const std::string& str, const std::string& ending); bool starts_with(const std::string& str, const std::string& start); bool contains(const std::string& str, const std::string& substr); @@ -53,6 +55,8 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo std::string trim(const std::string& s); std::vector> parse_prompt_attention(const std::string& text); +ggml_graph_eval_callback get_callback_eval(); +void* get_callback_eval_user_data(); #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) 
log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) From ae2cbce7d135b10f6579993085ca5028d9a690b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 23 Mar 2025 19:30:06 +0100 Subject: [PATCH 02/15] Refactor imatrix implementation into main example --- examples/CMakeLists.txt | 3 +- examples/cli/main.cpp | 56 +- examples/imatrix/CMakeLists.txt | 7 - examples/imatrix/imatrix.cpp | 1089 ------------------------------- imatrix.hpp | 45 +- model.cpp | 78 +-- model.h | 4 +- stable-diffusion.h | 3 +- 8 files changed, 88 insertions(+), 1197 deletions(-) delete mode 100644 examples/imatrix/CMakeLists.txt delete mode 100644 examples/imatrix/imatrix.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d1806c7ae..81053f9e2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,3 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_subdirectory(cli) -add_subdirectory(imatrix) \ No newline at end of file +add_subdirectory(cli) \ No newline at end of file diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 5cbd0320c..d2f407562 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -22,6 +22,10 @@ #define STB_IMAGE_RESIZE_STATIC #include "stb_image_resize.h" +#define IMATRIX_IMPL +#include "imatrix.hpp" +static IMatrixCollector g_collector; + const char* rng_type_to_str[] = { "std_default", "cuda", @@ -129,6 +133,12 @@ struct SDParams { float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + + /* Imatrix params */ + + std::string imatrix_out = ""; + + std::vector imatrix_in = {}; }; void print_params(SDParams params) { @@ -204,6 +214,8 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the 
weight file\n"); + printf(" --imat-out [PATH] If set, compute the imatrix for this run and save it to the provided path"); + printf(" --imat-in [PATH] Use imatrix for quantization."); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); @@ -629,6 +641,18 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--imat-out") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.imatrix_out = argv[i]; + } else if (arg == "--imat-in") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.imatrix_in.push_back(std::string(argv[i])); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -787,6 +811,10 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } +static bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + int main(int argc, const char* argv[]) { SDParams params; @@ -799,8 +827,21 @@ int main(int argc, const char* argv[]) { printf("%s", sd_get_system_info()); } + if (params.imatrix_out != "") { + sd_set_backend_eval_callback((sd_graph_eval_callback_t)collect_imatrix, ¶ms); + } + if (params.imatrix_out != "" || params.mode == CONVERT || params.wtype != SD_TYPE_COUNT) { + setConvertImatrixCollector((void*)&g_collector); + for (const auto& in_file : params.imatrix_in) { + printf("loading imatrix from '%s'\n", in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + printf("Failed to load %s\n", in_file.c_str()); + } + } + } + if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype,NULL); + bool success = 
convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", @@ -1075,11 +1116,11 @@ int main(int argc, const char* argv[]) { std::string dummy_name, ext, lc_ext; bool is_jpg; - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); size_t last_path = std::min(params.output_path.find_last_of("/"), params.output_path.find_last_of("\\")); - if (last != std::string::npos // filename has extension - && (last_path == std::string::npos || last > last_path)) { + if (last != std::string::npos // filename has extension + && (last_path == std::string::npos || last > last_path)) { dummy_name = params.output_path.substr(0, last); ext = lc_ext = params.output_path.substr(last); std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); @@ -1087,7 +1128,7 @@ int main(int argc, const char* argv[]) { } else { dummy_name = params.output_path; ext = lc_ext = ""; - is_jpg = false; + is_jpg = false; } // appending ".png" to absent or unknown extension if (!is_jpg && lc_ext != ".png") { @@ -1099,7 +1140,7 @@ int main(int argc, const char* argv[]) { continue; } std::string final_image_path = i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; - if(is_jpg) { + if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 90, get_image_params(params, params.seed + i).c_str()); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); @@ -1111,6 +1152,9 @@ int main(int argc, const char* argv[]) { free(results[i].data); results[i].data = NULL; } + if (params.imatrix_out != "") { + g_collector.save_imatrix(params.imatrix_out); + } free(results); free_sd_ctx(sd_ctx); free(control_image_buffer); diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt deleted file mode 100644 index 4c5e46939..000000000 --- a/examples/imatrix/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -set(TARGET sd-imatrix) - -add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT}) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PUBLIC cxx_std_11) \ No newline at end of file diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp deleted file mode 100644 index a974bb1d7..000000000 --- a/examples/imatrix/imatrix.cpp +++ /dev/null @@ -1,1089 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "stable-diffusion.h" - -#define STB_IMAGE_IMPLEMENTATION -#define STB_IMAGE_STATIC -#include "stb_image.h" - -#define STB_IMAGE_WRITE_IMPLEMENTATION -#define STB_IMAGE_WRITE_STATIC -#include "stb_image_write.h" - -#define STB_IMAGE_RESIZE_IMPLEMENTATION -#define STB_IMAGE_RESIZE_STATIC -#include "stb_image_resize.h" - -const char* rng_type_to_str[] = { - "std_default", - "cuda", -}; - -// Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - 
"dpm++2m", - "dpm++2mv2", - "ipndm", - "ipndm_v", - "lcm", - "ddim_trailing", - "tcd", -}; - -// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", - "exponential", - "ays", - "gits", -}; - -const char* modes_str[] = { - "txt2img", - "img2img", - "img2vid", - "convert", -}; - -const char* previews_str[] = { - "none", - "proj", - "tae", - "vae", -}; - -enum SDMode { - TXT2IMG, - IMG2IMG, - IMG2VID, - CONVERT, - MODE_COUNT -}; - -struct SDParams { - int n_threads = -1; - SDMode mode = TXT2IMG; - std::string model_path; - std::string clip_l_path; - std::string clip_g_path; - std::string t5xxl_path; - std::string diffusion_model_path; - std::string vae_path; - std::string taesd_path; - std::string esrgan_path; - std::string controlnet_path; - std::string embeddings_path; - std::string stacked_id_embeddings_path; - std::string input_id_images_path; - sd_type_t wtype = SD_TYPE_COUNT; - std::string lora_model_dir; - std::string output_path = "output.png"; - std::string input_path; - std::string mask_path; - std::string control_image_path; - - std::string prompt; - std::string negative_prompt; - float min_cfg = 1.0f; - float cfg_scale = 7.0f; - float guidance = 3.5f; - float eta = 0.f; - float style_ratio = 20.f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; - int batch_count = 1; - - int video_frames = 6; - int motion_bucket_id = 127; - int fps = 6; - float augmentation_level = 0.f; - - sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - float control_strength = 0.9f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; - bool control_net_cpu = false; - bool normalize_input = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool 
canny_preprocess = false; - bool color = false; - int upscale_repeats = 1; - - std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.0f; - float skip_layer_start = 0.01f; - float skip_layer_end = 0.2f; - - /* Imatrix params */ - - bool process_output = false; - int n_out_freq = 0; - int n_save_freq = 0; - - std::string out_file = "imatrix.dat"; - - std::vector in_files = {}; -}; - -#include "imatrix.hpp" - -static IMatrixCollector g_collector; - -/* Enables Printing the log level tag in color using ANSI escape codes */ -void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; - int tag_color; - const char* level_str; - FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout; - - if (!log || (!params->verbose && level <= SD_LOG_DEBUG)) { - return; - } - - switch (level) { - case SD_LOG_DEBUG: - tag_color = 37; - level_str = "DEBUG"; - break; - case SD_LOG_INFO: - tag_color = 34; - level_str = "INFO"; - break; - case SD_LOG_WARN: - tag_color = 35; - level_str = "WARN"; - break; - case SD_LOG_ERROR: - tag_color = 31; - level_str = "ERROR"; - break; - default: /* Potential future-proofing */ - tag_color = 33; - level_str = "?????"; - break; - } - - if (params->color == true) { - fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str); - } else { - fprintf(out_stream, "[%-5s] ", level_str); - } - fputs(log, out_stream); - fflush(out_stream); -} -void print_params(SDParams params) { - (void)params; -} - -void print_usage(int, const char** argv) { - printf("\nexample usage:\n"); - printf( - "\n %s \\\n" - " {same as sd.exe} [-O imatrix.dat]\\\n" - " [--output-frequency 10] [--save-frequency 0] \\\n" - " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n", - argv[0]); - printf("\n"); -} - -void parse_args(int argc, const char** argv, SDParams& params) { - bool invalid_arg = false; - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-t" || arg == 
"--threads") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "-M" || arg == "--mode") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* mode_selected = argv[i]; - int mode_found = -1; - for (int d = 0; d < MODE_COUNT; d++) { - if (!strcmp(mode_selected, modes_str[d])) { - mode_found = d; - } - } - if (mode_found == -1) { - fprintf(stderr, - "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n", - mode_selected); - exit(1); - } - params.mode = (SDMode)mode_found; - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.model_path = argv[i]; - } else if (arg == "--clip_l") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_l_path = argv[i]; - } else if (arg == "--clip_g") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_g_path = argv[i]; - } else if (arg == "--t5xxl") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.t5xxl_path = argv[i]; - } else if (arg == "--diffusion-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.diffusion_model_path = argv[i]; - } else if (arg == "--vae") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.vae_path = argv[i]; - } else if (arg == "--taesd") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.taesd_path = argv[i]; - } else if (arg == "--control-net") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.controlnet_path = argv[i]; - } else if (arg == "--upscale-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.esrgan_path = argv[i]; - } else if (arg == "--embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.embeddings_path = argv[i]; - } else if (arg == "--stacked-id-embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.stacked_id_embeddings_path = argv[i]; 
- } else if (arg == "--input-id-images-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.input_id_images_path = argv[i]; - } else if (arg == "--type") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string type = argv[i]; - bool found = false; - std::string valid_types = ""; - for (size_t i = 0; i < SD_TYPE_COUNT; i++) { - auto trait = ggml_get_type_traits((ggml_type)i); - std::string name(trait->type_name); - if (name == "f32" || trait->to_float && trait->type_size) { - if (i) - valid_types += ", "; - valid_types += name; - if (type == name) { - if (ggml_quantize_requires_imatrix((ggml_type)i)) { - printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. A dummy imatrix will be used, expect poor quality.\n", trait->type_name); - } - params.wtype = (enum sd_type_t)i; - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", - type.c_str(), - valid_types.c_str()); - exit(1); - } - } else if (arg == "--lora-model-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.lora_model_dir = argv[i]; - } else if (arg == "-i" || arg == "--init-img") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.input_path = argv[i]; - } else if (arg == "--mask") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.mask_path = argv[i]; - } else if (arg == "--control-image") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_image_path = argv[i]; - } else if (arg == "-o" || arg == "--output") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.output_path = argv[i]; - } else if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.prompt = argv[i]; - } else if (arg == "--upscale-repeats") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.upscale_repeats = std::stoi(argv[i]); - if (params.upscale_repeats < 1) { 
- fprintf(stderr, "error: upscale multiplier must be at least 1\n"); - exit(1); - } - } else if (arg == "-n" || arg == "--negative-prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.negative_prompt = argv[i]; - } else if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.cfg_scale = std::stof(argv[i]); - } else if (arg == "--guidance") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.guidance = std::stof(argv[i]); - } else if (arg == "--eta") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.eta = std::stof(argv[i]); - } else if (arg == "--strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.strength = std::stof(argv[i]); - } else if (arg == "--style-ratio") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.style_ratio = std::stof(argv[i]); - } else if (arg == "--control-strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_strength = std::stof(argv[i]); - } else if (arg == "-H" || arg == "--height") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.height = std::stoi(argv[i]); - } else if (arg == "-W" || arg == "--width") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.width = std::stoi(argv[i]); - } else if (arg == "--steps") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.sample_steps = std::stoi(argv[i]); - } else if (arg == "--clip-skip") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_skip = std::stoi(argv[i]); - } else if (arg == "--vae-tiling") { - params.vae_tiling = true; - } else if (arg == "--control-net-cpu") { - params.control_net_cpu = true; - } else if (arg == "--normalize-input") { - params.normalize_input = true; - } else if (arg == "--clip-on-cpu") { - params.clip_on_cpu = true; // will slow down get_learned_condiotion but necessary for low MEM GPUs - } else if (arg == "--vae-on-cpu") { - 
params.vae_on_cpu = true; // will slow down latent decoding but necessary for low MEM GPUs - } else if (arg == "--diffusion-fa") { - params.diffusion_flash_attn = true; // can reduce MEM significantly - } else if (arg == "--canny") { - params.canny_preprocess = true; - } else if (arg == "-b" || arg == "--batch-count") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.batch_count = std::stoi(argv[i]); - } else if (arg == "--rng") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string rng_type_str = argv[i]; - if (rng_type_str == "std_default") { - params.rng_type = STD_DEFAULT_RNG; - } else if (rng_type_str == "cuda") { - params.rng_type = CUDA_RNG; - } else { - invalid_arg = true; - break; - } - } else if (arg == "--schedule") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* schedule_selected = argv[i]; - int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { - if (!strcmp(schedule_selected, schedule_str[d])) { - schedule_found = d; - } - } - if (schedule_found == -1) { - invalid_arg = true; - break; - } - params.schedule = (schedule_t)schedule_found; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.seed = std::stoll(argv[i]); - } else if (arg == "--sampling-method") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; - for (int m = 0; m < N_SAMPLE_METHODS; m++) { - if (!strcmp(sample_method_selected, sample_method_str[m])) { - sample_method_found = m; - } - } - if (sample_method_found == -1) { - invalid_arg = true; - break; - } - params.sample_method = (sample_method_t)sample_method_found; - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv); - exit(0); - } else if (arg == "-v" || arg == "--verbose") { - params.verbose = true; - } else if (arg == "--color") { - params.color = true; - } else if (arg == "--slg-scale") { - if (++i >= 
argc) { - invalid_arg = true; - break; - } - params.slg_scale = std::stof(argv[i]); - } else if (arg == "--skip-layers") { - if (++i >= argc) { - invalid_arg = true; - break; - } - if (argv[i][0] != '[') { - invalid_arg = true; - break; - } - std::string layers_str = argv[i]; - while (layers_str.back() != ']') { - if (++i >= argc) { - invalid_arg = true; - break; - } - layers_str += " " + std::string(argv[i]); - } - layers_str = layers_str.substr(1, layers_str.size() - 2); - - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument& e) { - invalid_arg = true; - break; - } - } - params.skip_layers = layers; - - if (invalid_arg) { - break; - } - } else if (arg == "--skip-layer-start") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.skip_layer_start = std::stof(argv[i]); - } else if (arg == "--skip-layer-end") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "-O") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.out_file = argv[i]; - } else if (arg == "--output-frequency") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.n_out_freq = std::stoi(argv[i]); - } else if (arg == "--save-frequency") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.n_out_freq = std::stoi(argv[i]); - } else if (arg == "--in-file") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.in_files.push_back(std::string(argv[i])); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - print_usage(argc, argv); - exit(1); - } - } - if (invalid_arg) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - print_usage(argc, argv); - 
exit(1); - } - if (params.n_threads <= 0) { - params.n_threads = get_num_physical_cores(); - } - - if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); - exit(1); - } - - if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); - print_usage(argc, argv); - exit(1); - } - - if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } - - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); - exit(1); - } - - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); - exit(1); - } - - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); - exit(1); - } - - if (params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } - - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - exit(1); - } - - if (params.seed < 0) { - srand((int)time(NULL)); - params.seed = rand(); - } - - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; - } - } -} - -static std::string sd_basename(const std::string& path) { - size_t pos = path.find_last_of('/'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - pos = path.find_last_of('\\'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - return path; -} - -std::string 
get_image_params(SDParams params, int64_t seed) { - std::string parameter_string = params.prompt + "\n"; - if (params.negative_prompt.size() != 0) { - parameter_string += "Negative prompt: " + params.negative_prompt + "\n"; - } - parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; - parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; - if (params.slg_scale != 0 && params.skip_layers.size() != 0) { - parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; - parameter_string += "Skip layers: ["; - for (const auto& layer : params.skip_layers) { - parameter_string += std::to_string(layer) + ", "; - } - parameter_string += "], "; - parameter_string += "Skip layer start: " + std::to_string(params.skip_layer_start) + ", "; - parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", "; - } - parameter_string += "Guidance: " + std::to_string(params.guidance) + ", "; - parameter_string += "Eta: " + std::to_string(params.eta) + ", "; - parameter_string += "Seed: " + std::to_string(seed) + ", "; - parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; - parameter_string += "Model: " + sd_basename(params.model_path) + ", "; - parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", "; - parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]); - if (params.schedule == KARRAS) { - parameter_string += " karras"; - } - parameter_string += ", "; - parameter_string += "Version: stable-diffusion.cpp"; - return parameter_string; -} - -const char* preview_path; - -void step_callback(int step, sd_image_t image) { - stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); -} - -static bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { - return g_collector.collect_imatrix(t, ask, user_data); -} - -int main(int argc, const char** argv) { - 
SDParams params; - - parse_args(argc, argv, params); - - sd_set_log_callback(sd_log_cb, (void*)¶ms); - - if (params.verbose) { - print_params(params); - printf("%s", sd_get_system_info()); - } - - g_collector.set_params(params); - - for (const auto& in_file : params.in_files) { - printf("loading imatrix from '%s'\n", in_file.c_str()); - if (!g_collector.load_imatrix(in_file.c_str())) { - LOG_ERROR("failed to load %s\n", in_file.c_str()); - return 1; - } - } - - sd_set_backend_eval_callback((sd_graph_eval_callback_t)collect_imatrix, ¶ms); - - if (params.mode == CONVERT) { - const char* imatrix_file = NULL; - if (params.in_files.size() > 0) { - imatrix_file = params.in_files[0].c_str(); - } - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, imatrix_file); - if (!success) { - fprintf(stderr, - "convert '%s'/'%s' to '%s' failed\n", - params.model_path.c_str(), - params.vae_path.c_str(), - params.output_path.c_str()); - return 1; - } else { - printf("convert '%s'/'%s' to '%s' success\n", - params.model_path.c_str(), - params.vae_path.c_str(), - params.output_path.c_str()); - return 0; - } - } - - if (params.mode == IMG2VID) { - fprintf(stderr, "SVD support is broken, do not use it!!!\n"); - return 1; - } - - bool vae_decode_only = true; - uint8_t* input_image_buffer = NULL; - uint8_t* control_image_buffer = NULL; - uint8_t* mask_image_buffer = NULL; - - if (params.mode == IMG2IMG || params.mode == IMG2VID) { - vae_decode_only = false; - - int c = 0; - int width = 0; - int height = 0; - input_image_buffer = stbi_load(params.input_path.c_str(), &width, &height, &c, 3); - if (input_image_buffer == NULL) { - fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); - return 1; - } - if (c < 3) { - fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); - free(input_image_buffer); - return 1; - } - if (width <= 0) { - fprintf(stderr, "error: 
the width of image must be greater than 0\n"); - free(input_image_buffer); - return 1; - } - if (height <= 0) { - fprintf(stderr, "error: the height of image must be greater than 0\n"); - free(input_image_buffer); - return 1; - } - - // Resize input image ... - if (params.height != height || params.width != width) { - printf("resize input image from %dx%d to %dx%d\n", width, height, params.width, params.height); - int resized_height = params.height; - int resized_width = params.width; - - uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * 3); - if (resized_image_buffer == NULL) { - fprintf(stderr, "error: allocate memory for resize input image\n"); - free(input_image_buffer); - return 1; - } - stbir_resize(input_image_buffer, width, height, 0, - resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, - 3 /*RGB channel*/, STBIR_ALPHA_CHANNEL_NONE, 0, - STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, - STBIR_FILTER_BOX, STBIR_FILTER_BOX, - STBIR_COLORSPACE_SRGB, nullptr); - - // Save resized result - free(input_image_buffer); - input_image_buffer = resized_image_buffer; - } - } - - sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), - params.clip_l_path.c_str(), - params.clip_g_path.c_str(), - params.t5xxl_path.c_str(), - params.diffusion_model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.controlnet_path.c_str(), - params.lora_model_dir.c_str(), - params.embeddings_path.c_str(), - params.stacked_id_embeddings_path.c_str(), - vae_decode_only, - params.vae_tiling, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.schedule, - params.clip_on_cpu, - params.control_net_cpu, - params.vae_on_cpu, - params.diffusion_flash_attn); - - if (sd_ctx == NULL) { - printf("new_sd_ctx_t failed\n"); - return 1; - } - - sd_image_t* control_image = NULL; - if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { - int c = 0; - control_image_buffer = 
stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); - if (control_image_buffer == NULL) { - fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); - return 1; - } - control_image = new sd_image_t{(uint32_t)params.width, - (uint32_t)params.height, - 3, - control_image_buffer}; - if (params.canny_preprocess) { // apply preprocessor - control_image->data = preprocess_canny(control_image->data, - control_image->width, - control_image->height, - 0.08f, - 0.08f, - 0.8f, - 1.0f, - false); - } - } - - std::vector default_mask_image_vec(params.width * params.height, 255); - if (params.mask_path != "") { - int c = 0; - mask_image_buffer = stbi_load(params.mask_path.c_str(), ¶ms.width, ¶ms.height, &c, 1); - } else { - mask_image_buffer = default_mask_image_vec.data(); - } - sd_image_t mask_image = {(uint32_t)params.width, - (uint32_t)params.height, - 1, - mask_image_buffer}; - - sd_image_t* results; - if (params.mode == TXT2IMG) { - results = txt2img(sd_ctx, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.guidance, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; - - if (params.mode == IMG2VID) { - results = img2vid(sd_ctx, - input_image, - params.width, - params.height, - params.video_frames, - params.motion_bucket_id, - params.fps, - params.augmentation_level, - params.min_cfg, - params.cfg_scale, - params.sample_method, - params.sample_steps, - params.strength, - params.seed); - if (results == NULL) { - 
printf("generate failed\n"); - free_sd_ctx(sd_ctx); - return 1; - } - size_t last = params.output_path.find_last_of("."); - std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; - for (int i = 0; i < params.video_frames; i++) { - if (results[i].data == NULL) { - continue; - } - std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result image to '%s'\n", final_image_path.c_str()); - free(results[i].data); - results[i].data = NULL; - } - free(results); - free_sd_ctx(sd_ctx); - return 0; - } else { - results = img2img(sd_ctx, - input_image, - mask_image, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.guidance, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.strength, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); - } - } - - if (results == NULL) { - printf("generate failed\n"); - free_sd_ctx(sd_ctx); - return 1; - } - - int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth - if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { - upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.n_threads); - - if (upscaler_ctx == NULL) { - printf("new_upscaler_ctx failed\n"); - } else { - for (int i = 0; i < params.batch_count; i++) { - if (results[i].data == NULL) { - continue; - } - sd_image_t current_image = results[i]; - for (int u = 0; u < 
params.upscale_repeats; ++u) { - sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); - if (upscaled_image.data == NULL) { - printf("upscale failed\n"); - break; - } - free(current_image.data); - current_image = upscaled_image; - } - results[i] = current_image; // Set the final upscaled image as the result - } - } - } - - std::string dummy_name, ext, lc_ext; - bool is_jpg; - size_t last = params.output_path.find_last_of("."); - size_t last_path = std::min(params.output_path.find_last_of("/"), - params.output_path.find_last_of("\\")); - if (last != std::string::npos // filename has extension - && (last_path == std::string::npos || last > last_path)) { - dummy_name = params.output_path.substr(0, last); - ext = lc_ext = params.output_path.substr(last); - std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); - is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe"; - } else { - dummy_name = params.output_path; - ext = lc_ext = ""; - is_jpg = false; - } - // appending ".png" to absent or unknown extension - if (!is_jpg && lc_ext != ".png") { - dummy_name += ext; - ext = ".png"; - } - for (int i = 0; i < params.batch_count; i++) { - if (results[i].data == NULL) { - continue; - } - std::string final_image_path = i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; - if (is_jpg) { - stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 90, get_image_params(params, params.seed + i).c_str()); - printf("save result JPEG image to '%s'\n", final_image_path.c_str()); - } else { - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result PNG image to '%s'\n", final_image_path.c_str()); - } - free(results[i].data); - results[i].data = NULL; - } - g_collector.save_imatrix(); - free(results); - free_sd_ctx(sd_ctx); - free(control_image_buffer); - free(input_image_buffer); - - return 0; -} diff --git a/imatrix.hpp b/imatrix.hpp index 946814a51..3b0ddd86b 100644 --- a/imatrix.hpp +++ b/imatrix.hpp @@ -1,3 +1,6 @@ +#ifndef IMATRIX_HPP +#define IMATRIX_HPP + #include "ggml-backend.h" #include "ggml.h" #include "util.h" @@ -8,6 +11,7 @@ /*Stolen from llama.cpp (credits: Kawrakow)*/ + struct Stats { std::vector values{}; std::vector counts{}; @@ -17,20 +21,26 @@ struct Stats { class IMatrixCollector { public: IMatrixCollector() = default; - void set_params(SDParams params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data); - void save_imatrix(int ncall = -1) const; + void save_imatrix(std::string fname, int ncall = -1) const; bool load_imatrix(const char* fname); - + std::vector get_values(const std::string& key) const { + auto it = m_stats.find(key); + if (it != m_stats.end()) { + return it->second.values; + } else { + return {}; + } + } private: std::unordered_map m_stats = {}; - SDParams m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id }; +#ifdef IMATRIX_IMPL // remove any prefix and suffixes from the name // CUDA0#blk.0.attn_k.weight#0 => 
blk.0.attn_k.weight static std::string filter_tensor_name(const char* name) { @@ -141,15 +151,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us } } } - // if (e.ncall > m_last_call) { - // m_last_call = e.ncall; - // if (m_last_call % m_params.n_out_freq == 0) { - // save_imatrix(); - // } - // if (m_params.n_save_freq > 0 && m_last_call % m_params.n_save_freq == 0) { - // save_imatrix(m_last_call); - // } - // } } } else { auto& e = m_stats[wname]; @@ -174,24 +175,13 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us } } } - - // if (e.ncall > m_last_call) { - // m_last_call = e.ncall; - // if (m_last_call % m_params.n_out_freq == 0 && m_last_call > 0) { - // save_imatrix(); - // } - // if (m_params.n_save_freq > 0 && m_last_call % m_params.n_save_freq == 0 && m_last_call > 0) { - // save_imatrix(m_last_call); - // } - // } } return true; } -void IMatrixCollector::save_imatrix(int ncall) const { +void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { LOG_INFO("SAVING_IMATRIX..."); - auto fname = m_params.out_file; if (ncall > 0) { fname += ".at_"; @@ -322,4 +312,7 @@ bool IMatrixCollector::load_imatrix(const char* fname) { e.ncall += ncall; } return true; -} \ No newline at end of file +} + +#endif +#endif \ No newline at end of file diff --git a/model.cpp b/model.cpp index 29d39e300..79af97a83 100644 --- a/model.cpp +++ b/model.cpp @@ -16,6 +16,7 @@ #include "ggml-cpu.h" #include "ggml.h" +#include "imatrix.hpp" #include "stable-diffusion.h" #ifdef SD_USE_METAL @@ -28,6 +29,8 @@ #define ST_HEADER_SIZE_LEN 8 +static IMatrixCollector* imatrix_collector = NULL; + uint64_t read_u64(uint8_t* buffer) { // little endian uint64_t value = 0; @@ -1714,7 +1717,7 @@ std::vector remove_duplicates(const std::vector& v return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend,std::unordered_map> imatrix_data) { +bool 
ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { std::vector processed_tensor_storages; for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); @@ -1839,10 +1842,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend auto processed_name = convert_tensor_name(tensor_storage.name); // LOG_DEBUG("%s",processed_name.c_str()); - std::vector imatrix = imatrix_data[processed_name]; + std::vector imatrix = imatrix_collector ? imatrix_collector->get_values(processed_name) : std::vector{}; convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0],imatrix); + dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0], imatrix); } } else { read_buffer.resize(tensor_storage.nbytes()); @@ -1866,7 +1869,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend // convert first, then copy to device memory auto processed_name = convert_tensor_name(tensor_storage.name); // LOG_DEBUG("%s",processed_name.c_str()); - std::vector imatrix = imatrix_data[processed_name]; + std::vector imatrix = imatrix_collector ? 
imatrix_collector->get_values(processed_name) : std::vector{}; convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, @@ -1932,7 +1935,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb, backend, {}); + bool success = load_tensors(on_new_tensor_cb, backend); if (!success) { LOG_ERROR("load tensors from file failed"); return false; @@ -1992,7 +1995,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, std::unordered_map> imatrix_data) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2030,7 +2033,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type return true; }; - bool success = load_tensors(on_new_tensor_cb, backend, imatrix_data); + bool success = load_tensors(on_new_tensor_cb, backend); ggml_backend_free(backend); LOG_INFO("load tensors done"); LOG_INFO("trying to save tensors to %s", file_path.c_str()); @@ -2066,54 +2069,11 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -static void load_imatrix(const std::string& imatrix_file, std::unordered_map>& imatrix_data) { - std::ifstream in(imatrix_file.c_str(), std::ios::binary); - if (!in) { - LOG_ERROR("%s: failed to open %s\n", imatrix_file.c_str()); - exit(1); - } - int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - LOG_ERROR("%s: no data in file %s\n", imatrix_file.c_str()); - exit(1); - } - for (int i = 0; i < n_entries; ++i) { - int len; - in.read((char*)&len, sizeof(len)); - std::vector name_as_vec(len + 1); - 
in.read((char*)name_as_vec.data(), len); - if (in.fail()) { - LOG_ERROR("%s: failed reading name for entry %d from %s\n", i + 1, imatrix_file.c_str()); - exit(1); - } - name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; - auto& e = imatrix_data[name]; - int ncall; - in.read((char*)&ncall, sizeof(ncall)); - int nval; - in.read((char*)&nval, sizeof(nval)); - if (in.fail() || nval < 1) { - LOG_ERROR("%s: failed reading number of values for entry %d\n", i); - imatrix_data = {}; - exit(1); - } - e.resize(nval); - in.read((char*)e.data(), nval * sizeof(float)); - if (in.fail()) { - LOG_ERROR("%s: failed reading data for entry %d\n", i); - imatrix_data = {}; - exit(1); - } - if (ncall > 0) { - for (auto& v : e) - v /= ncall; - } - } +void setConvertImatrixCollector(void* collector) { + imatrix_collector = ((IMatrixCollector*)collector); } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* imatrix_path = NULL) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2128,16 +2088,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa } } - std::unordered_map> imatrix_data = {}; - - if(imatrix_path){ - load_imatrix(imatrix_path, imatrix_data); - } - - // for (const auto& pair : imatrix_data) { - // LOG_DEBUG("imatrix key : %s", pair.first.c_str()); - // } - - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, imatrix_data); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); return success; } diff --git a/model.h b/model.h index f8179643c..d7f976533 100644 --- a/model.h +++ b/model.h @@ -216,12 +216,12 @@ class ModelLoader { ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool 
load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend, std::unordered_map> imatrix_data = {}); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); bool load_tensors(std::map& tensors, ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type, std::unordered_map>); + bool save_to_gguf_file(const std::string& file_path, ggml_type type); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index d5a2b2f48..50799e45d 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -230,7 +230,8 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* imatrix_path); +SD_API void setConvertImatrixCollector(void * collector); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, From b5ecf2c046bbf770cb92e2db4b8cc96b485b13e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 23 Mar 2025 20:28:04 +0100 Subject: [PATCH 03/15] do not use logger in imatrix.hpp --- imatrix.hpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/imatrix.hpp b/imatrix.hpp index 3b0ddd86b..a1c0f55c8 100644 --- a/imatrix.hpp +++ b/imatrix.hpp @@ -8,6 +8,7 @@ #include #include #include +#include /*Stolen from llama.cpp (credits: Kawrakow)*/ @@ -119,10 +120,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us 
e.values.resize(src1->ne[0] * n_as, 0); e.counts.resize(src1->ne[0] * n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0] * n_as) { - LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as); + printf("ERROR: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as); exit(1); // GGML_ABORT("fatal error"); } - LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch for (int ex = 0; ex < n_as; ++ex) { size_t e_start = ex * src1->ne[0]; @@ -144,8 +145,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us e.values[e_start + j] += x[j] * x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { - LOG_INFO("\n"); - LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); + printf("\n"); + printf("%ERROR: f detected in %s\n", e.values[e_start + j], wname.c_str()); exit(1); } } @@ -158,7 +159,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us e.values.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + printf("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); // GGML_ABORT("fatal error"); } @@ -170,7 +171,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us e.values[j] += x[j] * x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { - LOG_ERROR("%f detected in %s\n", e.values[j], 
wname.c_str()); + printf("%f detected in %s\n", e.values[j], wname.c_str()); exit(1); } } @@ -181,7 +182,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us } void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { - LOG_INFO("SAVING_IMATRIX..."); + printf("SAVING_IMATRIX...\n"); if (ncall > 0) { fname += ".at_"; @@ -209,17 +210,17 @@ void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { } if (n_zeros != 0 && is_first) { - LOG_INFO("\n"); + printf("\n"); is_first = false; } if (n_zeros == n_all) { - LOG_WARN("entry '%40s' has no data - skipping\n", kv.first.c_str()); + printf("WARNING: entry '%40s' has no data - skipping\n", kv.first.c_str()); continue; } if (n_zeros > 0) { - LOG_WARN("entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + printf("WARNING: entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); continue; } @@ -228,7 +229,7 @@ void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { } if (to_store.size() < m_stats.size()) { - LOG_WARN("storing only %zu out of %zu entries\n", to_store.size(), m_stats.size()); + printf("WARNING: storing only %zu out of %zu entries\n", to_store.size(), m_stats.size()); } std::ofstream out(fname, std::ios::binary); @@ -253,20 +254,20 @@ void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { // Write the number of call the matrix was computed with out.write((const char*)&m_last_call, sizeof(m_last_call)); - LOG_DEBUG("\n"); - LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str()); + // LOG_DEBUG("\n"); + // LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str()); } bool IMatrixCollector::load_imatrix(const char* fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - LOG_ERROR("failed to open %s\n", fname); + printf("ERROR: failed to open 
%s\n", fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - LOG_ERROR("no data in file %s\n", fname); + printf("ERROR: no data in file %s\n", fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -275,7 +276,7 @@ bool IMatrixCollector::load_imatrix(const char* fname) { std::vector name_as_vec(len + 1); in.read((char*)name_as_vec.data(), len); if (in.fail()) { - LOG_ERROR("failed reading name for entry %d from %s\n", i + 1, fname); + printf("ERROR: failed reading name for entry %d from %s\n", i + 1, fname); return false; } name_as_vec[len] = 0; @@ -286,7 +287,7 @@ bool IMatrixCollector::load_imatrix(const char* fname) { int nval; in.read((char*)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - LOG_ERROR("failed reading number of values for entry %d\n", i); + printf("ERROR: failed reading number of values for entry %d\n", i); m_stats = {}; return false; } @@ -299,7 +300,7 @@ bool IMatrixCollector::load_imatrix(const char* fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval * sizeof(float)); if (in.fail()) { - LOG_ERROR("failed reading data for entry %d\n", i); + printf("ERROR: failed reading data for entry %d\n", i); m_stats = {}; return false; } From 134c2351cce32978a0ab837ad27ffcde3b14ba75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 24 Mar 2025 02:55:26 +0100 Subject: [PATCH 04/15] imatrix: support DiT text encoders --- imatrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imatrix.hpp b/imatrix.hpp index a1c0f55c8..8cdc64b4a 100644 --- a/imatrix.hpp +++ b/imatrix.hpp @@ -76,7 +76,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us return false; // why are small batches ignored (<16 tokens)? // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 6) == "model." 
|| wname.substr(0, 17) == "cond_stage_model.")) + if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." || wname.substr(0,14) == "text_encoders.")) return false; return true; } From a6089d557e884ee2fe441f382f0174204a0f5809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 28 Mar 2025 12:32:32 +0100 Subject: [PATCH 05/15] Model: merge split models when converting --- examples/cli/main.cpp | 2 +- model.cpp | 37 +++++++++++++++++++++++++++++++++---- stable-diffusion.h | 2 +- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index d2f407562..097b6e568 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -841,7 +841,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), params.clip_l_path.c_str(), params.clip_g_path.c_str(), params.t5xxl_path.c_str(), params.diffusion_model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/model.cpp b/model.cpp index 79af97a83..4cd3a9e9a 100644 --- a/model.cpp +++ b/model.cpp @@ -2073,12 +2073,41 @@ void setConvertImatrixCollector(void* collector) { imatrix_collector = ((IMatrixCollector*)collector); } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char* model_path, const char* clip_l_path, const char* clip_g_path, const char* t5xxl_path, const char* diffusion_model_path, const char* vae_path, const char* output_path, sd_type_t output_type) { ModelLoader model_loader; - if (!model_loader.init_from_file(input_path)) { - LOG_ERROR("init model loader from file failed: '%s'", input_path); - return false; + if (model_path != 
NULL && strlen(model_path) > 0) { + if (!model_loader.init_from_file(model_path)) { + LOG_ERROR("init model loader from file failed: '%s'", model_path); + return false; + } + } + + if (clip_l_path != NULL && strlen(clip_l_path) > 0) { + if (!model_loader.init_from_file(clip_l_path, "text_encoders.clip_l.transformer.")) { + LOG_ERROR("init model loader from file failed: '%s'", clip_l_path); + return false; + } + } + + if (clip_g_path != NULL && strlen(clip_g_path) > 0) { + if (!model_loader.init_from_file(clip_g_path, "text_encoders.clip_g.transformer.")) { + LOG_ERROR("init model loader from file failed: '%s'", clip_g_path); + return false; + } + } + if (t5xxl_path != NULL && strlen(t5xxl_path) > 0) { + if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) { + LOG_ERROR("init model loader from file failed: '%s'", t5xxl_path); + return false; + } + } + + if (diffusion_model_path != NULL && strlen(diffusion_model_path) > 0) { + if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) { + LOG_ERROR("init model loader from file failed: '%s'", diffusion_model_path); + return false; + } } if (vae_path != NULL && strlen(vae_path) > 0) { diff --git a/stable-diffusion.h b/stable-diffusion.h index 50799e45d..8010361a3 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -231,7 +231,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); SD_API void setConvertImatrixCollector(void * collector); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); +SD_API bool convert(const char* model_path, const char* clip_l_path, const char* clip_g_path, const char* t5xxl_path, const char* diffusion_model_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, From 
f719cb90c428656cee3b337eb21008130c137ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 03:31:19 +0100 Subject: [PATCH 06/15] Do not warn about imatrix requirement if imatrix is provided --- examples/cli/main.cpp | 56 +++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 097b6e568..5f98d6cad 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -262,6 +262,7 @@ void print_usage(int argc, const char* argv[]) { void parse_args(int argc, const char** argv, SDParams& params) { bool invalid_arg = false; std::string arg; + std::string type = ""; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -367,32 +368,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - std::string type = argv[i]; - bool found = false; - std::string valid_types = ""; - for (size_t i = 0; i < SD_TYPE_COUNT; i++) { - auto trait = ggml_get_type_traits((ggml_type)i); - std::string name(trait->type_name); - if (name == "f32" || trait->to_float && trait->type_size) { - if (i) - valid_types += ", "; - valid_types += name; - if (type == name) { - if (ggml_quantize_requires_imatrix((ggml_type)i)) { - printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. 
A dummy imatrix will be used, expect poor quality.\n", trait->type_name); - } - params.wtype = (enum sd_type_t)i; - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", - type.c_str(), - valid_types.c_str()); - exit(1); - } + type = argv[i]; } else if (arg == "--lora-model-dir") { if (++i >= argc) { invalid_arg = true; @@ -659,6 +635,34 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } } + if (type != "") { + bool found = false; + std::string valid_types = ""; + for (size_t i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + std::string name(trait->type_name); + if (name == "f32" || trait->to_float && trait->type_size) { + if (i) + valid_types += ", "; + valid_types += name; + if (type == name) { + if (ggml_quantize_requires_imatrix((ggml_type)i) && params.imatrix_in.size() == 0) { + printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. 
A dummy imatrix will be used, expect poor quality.\n", trait->type_name); + } + params.wtype = (enum sd_type_t)i; + found = true; + break; + } + } + } + if (!found) { + fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", + type.c_str(), + valid_types.c_str()); + exit(1); + } + } + if (invalid_arg) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); print_usage(argc, argv); From ea6a54392359337a1d3c9b86bd176228fd149d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 18:36:32 +0100 Subject: [PATCH 07/15] Make imatrix not a header-only lib --- imatrix.cpp | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++ imatrix.hpp | 281 --------------------------------------------------- 2 files changed, 285 insertions(+), 281 deletions(-) create mode 100644 imatrix.cpp diff --git a/imatrix.cpp b/imatrix.cpp new file mode 100644 index 000000000..7c73f1192 --- /dev/null +++ b/imatrix.cpp @@ -0,0 +1,285 @@ +#include "imatrix.hpp" + +/*Stolen from llama.cpp (credits: Kawrakow)*/ + +#include "ggml-backend.h" +#include "ggml.h" +#include "util.h" + +#include +#include +#include +#include + +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char* name) { + std::string wname; + const char* p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char* q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { + GGML_UNUSED(user_data); + const struct ggml_tensor* src0 = t->src[0]; + const struct ggml_tensor* src1 = t->src[1]; + std::string wname = filter_tensor_name(src0->name); + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a 
follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) + return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) + return false; + // why are small batches ignored (<16 tokens)? + // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." || wname.substr(0,14) == "text_encoders.")) + return false; + return true; + } + // LOG_DEBUG("%s", wname.c_str()); + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float* data = is_host ? (const float*)src1->data : m_src1_data.data(); + + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 + if (t->op == GGML_OP_MUL_MAT_ID) { + // ids -> [n_experts_used, n_tokens] + // src1 -> [cols, n_expert_used, n_tokens] + const ggml_tensor* ids = t->src[2]; + const int n_as = src0->ne[2]; + const int n_ids = ids->ne[0]; + + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! 
+ + GGML_ASSERT(ids->ne[1] == src1->ne[2]); + + m_ids.resize(ggml_nbytes(ids)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto& e = m_stats[wname]; + + ++e.ncall; + + if (e.values.empty()) { + e.values.resize(src1->ne[0] * n_as, 0); + e.counts.resize(src1->ne[0] * n_as, 0); + } else if (e.values.size() != (size_t)src1->ne[0] * n_as) { + LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as); + exit(1); // GGML_ABORT("fatal error"); + } + // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + // loop over all possible experts, regardless if they are used or not in the batch + for (int ex = 0; ex < n_as; ++ex) { + size_t e_start = ex * src1->ne[0]; + + for (int idx = 0; idx < n_ids; ++idx) { + for (int row = 0; row < (int)src1->ne[2]; ++row) { + const int excur = *(const int32_t*)(m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]); + + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + + if (excur != ex) + continue; + + const int64_t i11 = idx % src1->ne[1]; + const int64_t i12 = row; + const float* x = (const float*)((const char*)data + i11 * src1->nb[1] + i12 * src1->nb[2]); + + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[e_start + j] += x[j] * x[j]; + e.counts[e_start + j]++; + if (!std::isfinite(e.values[e_start + j])) { + printf("\n"); + LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); + exit(1); + } + } + } + } + } + } else { + auto& e = m_stats[wname]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + e.counts.resize(src1->ne[0], 0); + } else if (e.values.size() != (size_t)src1->ne[0]) { + LOG_WARN("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + exit(1); // GGML_ABORT("fatal error"); + } + + ++e.ncall; + // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, 
wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float* x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j] * x[j]; + e.counts[j]++; + if (!std::isfinite(e.values[j])) { + LOG_WARN("%f detected in %s\n", e.values[j], wname.c_str()); + exit(1); + } + } + } + } + return true; + +} + +void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { + LOG_INFO("SAVING_IMATRIX to %s\n", fname.c_str()); + + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } + // avoid writing imatrix entries that do not have full data + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + int n_entries = 0; + std::vector to_store; + + bool is_first = true; // for printing + for (const auto& kv : m_stats) { + const int n_all = kv.second.counts.size(); + + if (n_all == 0) { + continue; + } + + int n_zeros = 0; + for (const int c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + printf("\n"); + is_first = false; + } + + if (n_zeros == n_all) { + LOG_WARN("entry '%40s' has no data - skipping\n", kv.first.c_str()); + continue; + } + + if (n_zeros > 0) { + LOG_WARN("entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + continue; + } + + n_entries++; + to_store.push_back(kv.first); + } + + if (to_store.size() < m_stats.size()) { + LOG_WARN("storing only %zu out of %zu entries\n", to_store.size(), m_stats.size()); + } + + std::ofstream out(fname, std::ios::binary); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (const auto& name : to_store) { + const auto& stat = m_stats.at(name); + int len = name.size(); + out.write((const char*)&len, sizeof(len)); + out.write(name.c_str(), len); + out.write((const char*)&stat.ncall, 
sizeof(stat.ncall)); + int nval = stat.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) { + std::vector tmp(nval); + for (int i = 0; i < nval; i++) { + tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); + } + out.write((const char*)tmp.data(), nval * sizeof(float)); + } + } + + // Write the number of call the matrix was computed with + out.write((const char*)&m_last_call, sizeof(m_last_call)); + + // LOG_DEBUG("\n"); + // LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str()); +} + +bool IMatrixCollector::load_imatrix(const char* fname) { + std::ifstream in(fname, std::ios::binary); + if (!in) { + LOG_ERROR("failed to open %s\n", fname); + return false; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERROR("no data in file %s\n", fname); + return false; + } + for (int i = 0; i < n_entries; ++i) { + int len; + in.read((char*)&len, sizeof(len)); + std::vector name_as_vec(len + 1); + in.read((char*)name_as_vec.data(), len); + if (in.fail()) { + LOG_ERROR("failed reading name for entry %d from %s\n", i + 1, fname); + return false; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = m_stats[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char*)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { + LOG_ERROR("failed reading number of values for entry %d\n", i); + m_stats = {}; + return false; + } + + if (e.values.empty()) { + e.values.resize(nval, 0); + e.counts.resize(nval, 0); + } + + std::vector tmp(nval); + in.read((char*)tmp.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERROR("failed reading data for entry %d\n", i); + m_stats = {}; + return false; + } + + // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. 
+ for (int i = 0; i < nval; i++) { + e.values[i] += tmp[i]; + e.counts[i] += ncall; + } + e.ncall += ncall; + } + return true; +} \ No newline at end of file diff --git a/imatrix.hpp b/imatrix.hpp index 8cdc64b4a..36e4932a8 100644 --- a/imatrix.hpp +++ b/imatrix.hpp @@ -1,10 +1,5 @@ #ifndef IMATRIX_HPP #define IMATRIX_HPP - -#include "ggml-backend.h" -#include "ggml.h" -#include "util.h" - #include #include #include @@ -12,7 +7,6 @@ /*Stolen from llama.cpp (credits: Kawrakow)*/ - struct Stats { std::vector values{}; std::vector counts{}; @@ -41,279 +35,4 @@ class IMatrixCollector { std::vector m_ids; // the expert ids from ggml_mul_mat_id }; -#ifdef IMATRIX_IMPL -// remove any prefix and suffixes from the name -// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight -static std::string filter_tensor_name(const char* name) { - std::string wname; - const char* p = strchr(name, '#'); - if (p != NULL) { - p = p + 1; - const char* q = strchr(p, '#'); - if (q != NULL) { - wname = std::string(p, q - p); - } else { - wname = p; - } - } else { - wname = name; - } - return wname; -} - -bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { - GGML_UNUSED(user_data); - const struct ggml_tensor* src0 = t->src[0]; - const struct ggml_tensor* src1 = t->src[1]; - std::string wname = filter_tensor_name(src0->name); - - // when ask is true, the scheduler wants to know if we are interested in data from this tensor - // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection - if (ask) { - if (t->op == GGML_OP_MUL_MAT_ID) - return true; // collect all indirect matrix multiplications - if (t->op != GGML_OP_MUL_MAT) - return false; - // why are small batches ignored (<16 tokens)? - // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." 
|| wname.substr(0,14) == "text_encoders.")) - return false; - return true; - } - // LOG_DEBUG("%s", wname.c_str()); - - std::lock_guard lock(m_mutex); - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(src1->buffer); - - if (!is_host) { - m_src1_data.resize(ggml_nelements(src1)); - ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); - } - - const float* data = is_host ? (const float*)src1->data : m_src1_data.data(); - - // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggml-org/llama.cpp/pull/6387 - if (t->op == GGML_OP_MUL_MAT_ID) { - // ids -> [n_experts_used, n_tokens] - // src1 -> [cols, n_expert_used, n_tokens] - const ggml_tensor* ids = t->src[2]; - const int n_as = src0->ne[2]; - const int n_ids = ids->ne[0]; - - // the top-k selected expert ids are stored in the ids tensor - // for simplicity, always copy ids to host, because it is small - // take into account that ids is not contiguous! 
- - GGML_ASSERT(ids->ne[1] == src1->ne[2]); - - m_ids.resize(ggml_nbytes(ids)); - ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); - - auto& e = m_stats[wname]; - - ++e.ncall; - - if (e.values.empty()) { - e.values.resize(src1->ne[0] * n_as, 0); - e.counts.resize(src1->ne[0] * n_as, 0); - } else if (e.values.size() != (size_t)src1->ne[0] * n_as) { - printf("ERROR: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as); - exit(1); // GGML_ABORT("fatal error"); - } - // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - // loop over all possible experts, regardless if they are used or not in the batch - for (int ex = 0; ex < n_as; ++ex) { - size_t e_start = ex * src1->ne[0]; - - for (int idx = 0; idx < n_ids; ++idx) { - for (int row = 0; row < (int)src1->ne[2]; ++row) { - const int excur = *(const int32_t*)(m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]); - - GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check - - if (excur != ex) - continue; - - const int64_t i11 = idx % src1->ne[1]; - const int64_t i12 = row; - const float* x = (const float*)((const char*)data + i11 * src1->nb[1] + i12 * src1->nb[2]); - - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[e_start + j] += x[j] * x[j]; - e.counts[e_start + j]++; - if (!std::isfinite(e.values[e_start + j])) { - printf("\n"); - printf("%ERROR: f detected in %s\n", e.values[e_start + j], wname.c_str()); - exit(1); - } - } - } - } - } - } else { - auto& e = m_stats[wname]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - e.counts.resize(src1->ne[0], 0); - } else if (e.values.size() != (size_t)src1->ne[0]) { - printf("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); // GGML_ABORT("fatal error"); - } - - ++e.ncall; - // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", 
m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float* x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j] * x[j]; - e.counts[j]++; - if (!std::isfinite(e.values[j])) { - printf("%f detected in %s\n", e.values[j], wname.c_str()); - exit(1); - } - } - } - } - return true; - -} - -void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { - printf("SAVING_IMATRIX...\n"); - - if (ncall > 0) { - fname += ".at_"; - fname += std::to_string(ncall); - } - // avoid writing imatrix entries that do not have full data - // this can happen with MoE models where some of the experts end up not being exercised by the provided training data - - int n_entries = 0; - std::vector to_store; - - bool is_first = true; // for printing - for (const auto& kv : m_stats) { - const int n_all = kv.second.counts.size(); - - if (n_all == 0) { - continue; - } - - int n_zeros = 0; - for (const int c : kv.second.counts) { - if (c == 0) { - n_zeros++; - } - } - - if (n_zeros != 0 && is_first) { - printf("\n"); - is_first = false; - } - - if (n_zeros == n_all) { - printf("WARNING: entry '%40s' has no data - skipping\n", kv.first.c_str()); - continue; - } - - if (n_zeros > 0) { - printf("WARNING: entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); - continue; - } - - n_entries++; - to_store.push_back(kv.first); - } - - if (to_store.size() < m_stats.size()) { - printf("WARNING: storing only %zu out of %zu entries\n", to_store.size(), m_stats.size()); - } - - std::ofstream out(fname, std::ios::binary); - out.write((const char*)&n_entries, sizeof(n_entries)); - for (const auto& name : to_store) { - const auto& stat = m_stats.at(name); - int len = name.size(); - out.write((const char*)&len, sizeof(len)); - out.write(name.c_str(), len); - out.write((const char*)&stat.ncall, 
sizeof(stat.ncall)); - int nval = stat.values.size(); - out.write((const char*)&nval, sizeof(nval)); - if (nval > 0) { - std::vector tmp(nval); - for (int i = 0; i < nval; i++) { - tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); - } - out.write((const char*)tmp.data(), nval * sizeof(float)); - } - } - - // Write the number of call the matrix was computed with - out.write((const char*)&m_last_call, sizeof(m_last_call)); - - // LOG_DEBUG("\n"); - // LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str()); -} - -bool IMatrixCollector::load_imatrix(const char* fname) { - std::ifstream in(fname, std::ios::binary); - if (!in) { - printf("ERROR: failed to open %s\n", fname); - return false; - } - int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - printf("ERROR: no data in file %s\n", fname); - return false; - } - for (int i = 0; i < n_entries; ++i) { - int len; - in.read((char*)&len, sizeof(len)); - std::vector name_as_vec(len + 1); - in.read((char*)name_as_vec.data(), len); - if (in.fail()) { - printf("ERROR: failed reading name for entry %d from %s\n", i + 1, fname); - return false; - } - name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; - auto& e = m_stats[std::move(name)]; - int ncall; - in.read((char*)&ncall, sizeof(ncall)); - int nval; - in.read((char*)&nval, sizeof(nval)); - if (in.fail() || nval < 1) { - printf("ERROR: failed reading number of values for entry %d\n", i); - m_stats = {}; - return false; - } - - if (e.values.empty()) { - e.values.resize(nval, 0); - e.counts.resize(nval, 0); - } - - std::vector tmp(nval); - in.read((char*)tmp.data(), nval * sizeof(float)); - if (in.fail()) { - printf("ERROR: failed reading data for entry %d\n", i); - m_stats = {}; - return false; - } - - // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. 
- for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; - } - e.ncall += ncall; - } - return true; -} - -#endif #endif \ No newline at end of file From 7f9fd3183cbccb71ece45d575939bf4da5f5220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 18:47:21 +0100 Subject: [PATCH 08/15] Warn user if imatrix will get overwritten --- examples/cli/main.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 5f98d6cad..3dc8dd53d 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -663,6 +663,14 @@ void parse_args(int argc, const char** argv, SDParams& params) { } } + if (params.imatrix_out.size() > 0 && file_exists(params.imatrix_out)) { + // imatrix file already exists + if (std::find(params.imatrix_in.begin(), params.imatrix_in.end(), params.imatrix_out) == params.imatrix_in.end()) { + printf("\n IMPORTANT: imatrix file %s already exists, but wasn't found in the imatrix inputs.\n", params.imatrix_out.c_str()); + printf("%s will get overwritten!\n", params.imatrix_out.c_str()); + } + } + if (invalid_arg) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); print_usage(argc, argv); From 91a7a669a53aa0ac4375b19c4893e45b5cae217a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 19:00:07 +0100 Subject: [PATCH 09/15] Fix missing includes --- imatrix.cpp | 5 +---- imatrix.hpp | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/imatrix.cpp b/imatrix.cpp index 7c73f1192..5612483ed 100644 --- a/imatrix.cpp +++ b/imatrix.cpp @@ -6,10 +6,7 @@ #include "ggml.h" #include "util.h" -#include -#include -#include -#include +#include // remove any prefix and suffixes from the name // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight diff --git a/imatrix.hpp b/imatrix.hpp index 36e4932a8..bcfd4e424 100644 --- a/imatrix.hpp +++ b/imatrix.hpp @@ -4,6 +4,7 @@ #include 
#include #include +#include /*Stolen from llama.cpp (credits: Kawrakow)*/ From ae0fcc29603b93cc056d68b1a4969487b0552a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 20:00:56 +0100 Subject: [PATCH 10/15] Refactor imatrix api, fix build shared libs --- examples/cli/main.cpp | 17 ++++------------- model.cpp | 26 +++++++++++++++++++------- stable-diffusion.h | 6 +++++- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 3dc8dd53d..ac446f5d8 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -22,10 +22,6 @@ #define STB_IMAGE_RESIZE_STATIC #include "stb_image_resize.h" -#define IMATRIX_IMPL -#include "imatrix.hpp" -static IMatrixCollector g_collector; - const char* rng_type_to_str[] = { "std_default", "cuda", @@ -663,7 +659,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } } - if (params.imatrix_out.size() > 0 && file_exists(params.imatrix_out)) { + if (params.imatrix_out.size() > 0 && std::ifstream(params.imatrix_out).good()) { // imatrix file already exists if (std::find(params.imatrix_in.begin(), params.imatrix_in.end(), params.imatrix_out) == params.imatrix_in.end()) { printf("\n IMPORTANT: imatrix file %s already exists, but wasn't found in the imatrix inputs.\n", params.imatrix_out.c_str()); @@ -823,10 +819,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -static bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { - return g_collector.collect_imatrix(t, ask, user_data); -} - int main(int argc, const char* argv[]) { SDParams params; @@ -840,13 +832,12 @@ int main(int argc, const char* argv[]) { } if (params.imatrix_out != "") { - sd_set_backend_eval_callback((sd_graph_eval_callback_t)collect_imatrix, ¶ms); + enableImatrixCollection(); } if (params.imatrix_out != "" || params.mode == CONVERT || params.wtype != SD_TYPE_COUNT) { - 
setConvertImatrixCollector((void*)&g_collector); for (const auto& in_file : params.imatrix_in) { printf("loading imatrix from '%s'\n", in_file.c_str()); - if (!g_collector.load_imatrix(in_file.c_str())) { + if (!loadImatrix(in_file.c_str())) { printf("Failed to load %s\n", in_file.c_str()); } } @@ -1165,7 +1156,7 @@ int main(int argc, const char* argv[]) { results[i].data = NULL; } if (params.imatrix_out != "") { - g_collector.save_imatrix(params.imatrix_out); + saveImatrix(params.imatrix_out.c_str()); } free(results); free_sd_ctx(sd_ctx); diff --git a/model.cpp b/model.cpp index 4cd3a9e9a..9672a978a 100644 --- a/model.cpp +++ b/model.cpp @@ -29,7 +29,7 @@ #define ST_HEADER_SIZE_LEN 8 -static IMatrixCollector* imatrix_collector = NULL; +static IMatrixCollector imatrix_collector; uint64_t read_u64(uint8_t* buffer) { // little endian @@ -1842,7 +1842,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend auto processed_name = convert_tensor_name(tensor_storage.name); // LOG_DEBUG("%s",processed_name.c_str()); - std::vector imatrix = imatrix_collector ? imatrix_collector->get_values(processed_name) : std::vector{}; + std::vector imatrix = imatrix_collector.get_values(processed_name); convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0], imatrix); @@ -1869,7 +1869,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend // convert first, then copy to device memory auto processed_name = convert_tensor_name(tensor_storage.name); // LOG_DEBUG("%s",processed_name.c_str()); - std::vector imatrix = imatrix_collector ? 
imatrix_collector->get_values(processed_name) : std::vector{}; + std::vector imatrix = imatrix_collector.get_values(processed_name); convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, @@ -2069,10 +2069,6 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -void setConvertImatrixCollector(void* collector) { - imatrix_collector = ((IMatrixCollector*)collector); -} - bool convert(const char* model_path, const char* clip_l_path, const char* clip_g_path, const char* t5xxl_path, const char* diffusion_model_path, const char* vae_path, const char* output_path, sd_type_t output_type) { ModelLoader model_loader; @@ -2120,3 +2116,19 @@ bool convert(const char* model_path, const char* clip_l_path, const char* clip_g bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); return success; } + +bool loadImatrix(const char* imatrix_path) { + return imatrix_collector.load_imatrix(imatrix_path); +} +void saveImatrix(const char* imatrix_path) { + imatrix_collector.save_imatrix(imatrix_path); +} +static bool collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) { + return imatrix_collector.collect_imatrix(t, ask, user_data); +} +void enableImatrixCollection() { + sd_set_backend_eval_callback((sd_graph_eval_callback_t)collect_imatrix, NULL); +} +void disableImatrixCollection() { + sd_set_backend_eval_callback(NULL, NULL); +} \ No newline at end of file diff --git a/stable-diffusion.h b/stable-diffusion.h index 8010361a3..ea5675101 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -230,7 +230,6 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API void setConvertImatrixCollector(void * collector); SD_API bool convert(const char* model_path, const char* clip_l_path, const char* clip_g_path, const char* 
t5xxl_path, const char* diffusion_model_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); SD_API uint8_t* preprocess_canny(uint8_t* img, @@ -242,6 +241,11 @@ SD_API uint8_t* preprocess_canny(uint8_t* img, float strong, bool inverse); +SD_API bool loadImatrix(const char * imatrix_path); +SD_API void saveImatrix(const char * imatrix_path); +SD_API void enableImatrixCollection(); +SD_API void disableImatrixCollection(); + #ifdef __cplusplus } #endif From 67c30e1d63ee0b4fa93ab6dc1a823df56365ae89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 23:41:02 +0100 Subject: [PATCH 11/15] imatrix: add docs --- docs/imatrix.md | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/imatrix.md diff --git a/docs/imatrix.md b/docs/imatrix.md new file mode 100644 index 000000000..e07b85abb --- /dev/null +++ b/docs/imatrix.md @@ -0,0 +1,59 @@ +# Importance Matrix (imatrix) Quantization + +## What is an Importance Matrix? + +Quantization reduces the precision of a model's weights, decreasing its size and computational requirements. However, this can lead to a loss of quality. An importance matrix helps mitigate this by identifying which weights are *most* important for the model's performance. During quantization, these important weights are preserved with higher precision, while less important weights are quantized more aggressively. This allows for better overall quality at a given quantization level. + +This originates from work done with language models in [llama.cpp](https://github.com/ggml-org/llama.cpp/blob/master/examples/imatrix/README.md). + +## Usage + +The imatrix feature involves two main steps: *training* the matrix and *using* it during quantization. + +### Training the Importance Matrix + +To generate an imatrix, run stable-diffusion.cpp with the `--imat-out` flag, specifying the output filename. 
This process runs alongside normal image generation. + +```bash +sd.exe [same exact parameters as normal generation] --imat-out imatrix.dat +``` + +* **`[same exact parameters as normal generation]`**: Use the same command-line arguments you would normally use for image generation (e.g., prompt, dimensions, sampling method, etc.). +* **`--imat-out imatrix.dat`**: Specifies the output file for the generated imatrix. + +You can generate multiple images at once using the `-b` flag to speed up the training process. + +### Continuing Training an Existing Matrix + +If you want to refine an existing imatrix, use the `--imat-in` flag *in addition* to `--imat-out`. This will load the existing matrix and continue training it. + +```bash +sd.exe [same exact parameters as normal generation] --imat-out imatrix.dat --imat-in imatrix.dat +``` +With that, you can train and refine the imatrix while generating images like you'd normally do. + +### Using Multiple Matrices + +You can load and merge multiple imatrices together: + +```bash +sd.exe [same exact parameters as normal generation] --imat-out imatrix.dat --imat-in imatrix.dat --imat-in imatrix2.dat +``` + +### Quantizing with an Importance Matrix + +To quantize a model using a trained imatrix, use the `-M convert` option (or equivalent quantization command) and the `--imat-in` flag, specifying the imatrix file. + +```bash +sd.exe -M convert [same exact parameters as normal quantization] --imat-in imatrix.dat +``` + +* **`[same exact parameters as normal quantization]`**: Use the same command-line arguments you would normally use for quantization (e.g., target quantization method, input/output filenames). +* **`--imat-in imatrix.dat`**: Specifies the imatrix file to use during quantization. You can specify multiple `--imat-in` flags to combine multiple matrices. + +## Important Considerations + +* The quality of the imatrix depends on the prompts and settings used during training. 
Use prompts and settings representative of the types of images you intend to generate for the best results. +* Experiment with different training parameters (e.g., number of images, prompt variations) to optimize the imatrix for your specific use case. +* The performance impact of training an imatrix during image generation or using an imatrix for quantization is negligible. +* Using already quantized models to train the imatrix seems to be working fine. \ No newline at end of file From 2dc5dfb491352f4d54cf6f32cb39dab9b09d3d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 29 Mar 2025 23:54:09 +0100 Subject: [PATCH 12/15] Avoid redefinition of ggml_log_callback_default --- ggml_extend.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 115f0555a..74c1ea326 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -119,13 +119,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g b); } -__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) { - (void)level; - (void)user_data; - fputs(text, stderr); - fflush(stderr); -} - __STATIC_INLINE__ void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr rng) { uint32_t n = (uint32_t)ggml_nelements(tensor); std::vector random_numbers = rng->randn(n); From 8c0b8e14b564d553c25c9223319da34ddafd08cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 31 Mar 2025 21:44:20 +0200 Subject: [PATCH 13/15] Fix typos --- examples/cli/main.cpp | 4 ++-- imatrix.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ac446f5d8..15cce84dd 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -210,8 +210,8 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type 
[TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); - printf(" --imat-out [PATH] If set, compute the imatrix for this run and save it to the provided path"); - printf(" --imat-in [PATH] Use imatrix for quantization."); + printf(" --imat-out [PATH] If set, compute the imatrix for this run and save it to the provided path\n"); + printf(" --imat-in [PATH] Use imatrix for quantization.\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); diff --git a/imatrix.cpp b/imatrix.cpp index 5612483ed..1d8e3c047 100644 --- a/imatrix.cpp +++ b/imatrix.cpp @@ -271,7 +271,7 @@ bool IMatrixCollector::load_imatrix(const char* fname) { return false; } - // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. + // Recreate the state as expected by save_imatrix(), and correct for weighted sum. 
for (int i = 0; i < nval; i++) { e.values[i] += tmp[i]; e.counts[i] += ncall; From 71eed146cd78ab771761888169cab7d82d90a5bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 31 Mar 2025 21:44:53 +0200 Subject: [PATCH 14/15] forgot to use imatrix in some cases --- model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model.cpp b/model.cpp index 9672a978a..7673313d0 100644 --- a/model.cpp +++ b/model.cpp @@ -1874,7 +1874,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0], imatrix); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } From a386ba9f2829e6afce47df7cf22db26dc46d3acd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 1 Apr 2025 14:17:03 +0200 Subject: [PATCH 15/15] fix imatrix collection on CPU backend --- imatrix.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/imatrix.cpp b/imatrix.cpp index 1d8e3c047..db786fdfb 100644 --- a/imatrix.cpp +++ b/imatrix.cpp @@ -42,7 +42,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us return false; // why are small batches ignored (<16 tokens)? // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." 
|| wname.substr(0, 14) == "text_encoders.")) return false; return true; } @@ -51,7 +51,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us std::lock_guard lock(m_mutex); // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + const bool is_host = src1->buffer == NULL || ggml_backend_buffer_is_host(src1->buffer); if (!is_host) { m_src1_data.resize(ggml_nelements(src1)); @@ -144,10 +144,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* us } } return true; - } -void IMatrixCollector::save_imatrix(std::string fname,int ncall) const { +void IMatrixCollector::save_imatrix(std::string fname, int ncall) const { LOG_INFO("SAVING_IMATRIX to %s\n", fname.c_str()); if (ncall > 0) {