From 4018e59b2a21f039e05e9f3495d7fa4f82caacc2 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Tue, 2 Dec 2025 17:28:26 +0000
Subject: [PATCH] feat(stablediffusion): Passthrough more parameters to support
 z-image and flux2 (#7414)

Signed-off-by: Richard Palethorpe <io@richiejp.com>
---
 backend/go/stablediffusion-ggml/gosd.cpp | 242 ++++++++++++++++++++++-
 gallery/index.yaml                       |   3 +
 2 files changed, 237 insertions(+), 8 deletions(-)
diff --git a/backend/go/stablediffusion-ggml/gosd.cpp b/backend/go/stablediffusion-ggml/gosd.cpp
index 768894470..29172ab02 100644
--- a/backend/go/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/gosd.cpp
@@ -1,4 +1,5 @@
 #include "stable-diffusion.h"
+#include <cmath>
 #include <cstdint>
 #define GGML_MAX_NAME 128
 
@@ -21,6 +22,7 @@
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
+#include <stdlib.h>
 
 // Names of the sampler method, same order as enum sample_method in stable-diffusion.h
 const char* sample_method_str[] = {
@@ -55,6 +57,68 @@ const char* schedulers[] = {
 
 static_assert(std::size(schedulers) == SCHEDULER_COUNT, "schedulers mismatch");
 
+// New enum string arrays
+const char* rng_type_str[] = {
+    "std_default",
+    "cuda",
+    "cpu",
+};
+static_assert(std::size(rng_type_str) == RNG_TYPE_COUNT, "rng type mismatch");
+
+const char* prediction_str[] = {
+    "default",
+    "epsilon",
+    "v",
+    "edm_v",
+    "sd3_flow",
+    "flux_flow",
+    "flux2_flow",
+};
+static_assert(std::size(prediction_str) == PREDICTION_COUNT, "prediction mismatch");
+
+const char* lora_apply_mode_str[] = {
+    "auto",
+    "immediately",
+    "at_runtime",
+};
+static_assert(std::size(lora_apply_mode_str) == LORA_APPLY_MODE_COUNT, "lora apply mode mismatch");
+
+const char* sd_type_str[] = {
+    [0] = "f32",
+    [1] = "f16",
+    [2] = "q4_0",
+    [3] = "q4_1",
+    [6] = "q5_0",
+    [7] = "q5_1",
+    [8] = "q8_0",
+    [9] = "q8_1",
+    [10] = "q2_k",
+    [11] = "q3_k",
+    [12] = "q4_k",
+    [13] = "q5_k",
+    [14] = "q6_k",
+    [15] = "q8_k",
+    [16] = "iq2_xxs",
+    [17] = "iq2_xs",
+    [18] = "iq3_xxs",
+    [19] = "iq1_s",
+    [20] = "iq4_nl",
+    [21] = "iq3_s",
+    [22] = "iq2_s",
+    [23] = "iq4_xs",
+    [24] = "i8",
+    [25] = "i16",
+    [26] = "i32",
+    [27] = "i64",
+    [28] = "f64",
+    [29] = "iq1_m",
+    [30] = "bf16",
+    [34] = "tq1_0",
+    [35] = "tq2_0",
+    [39] = "mxfp4",
+};
+static_assert(std::size(sd_type_str) == SD_TYPE_COUNT, "sd type mismatch");
+
 sd_ctx_t* sd_c;
 // Moved from the context (load time) to generation time params
 scheduler_t scheduler = SCHEDULER_COUNT;
@@ -110,9 +174,41 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     const char *vae_path  = "";
     const char *scheduler_str = "";
     const char *sampler = "";
+    const char *clip_vision_path = "";
+    const char *llm_path = "";
+    const char *llm_vision_path = "";
+    const char *diffusion_model_path = stableDiffusionModel;
+    const char *high_noise_diffusion_model_path = "";
+    const char *taesd_path  = "";
+    const char *control_net_path = "";
+    const char *embedding_dir = "";
+    const char *photo_maker_path = "";
+    const char *tensor_type_rules = "";
     char *lora_dir = model_path;
     bool lora_dir_allocated = false;
 
+    bool vae_decode_only = true;
+    bool free_params_immediately = true;
+    int n_threads = threads;
+    enum sd_type_t wtype = SD_TYPE_COUNT;
+    enum rng_type_t rng_type = STD_DEFAULT_RNG;
+    enum rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
+    enum prediction_t prediction = PREDICTION_COUNT;
+    enum lora_apply_mode_t lora_apply_mode = LORA_APPLY_MODE_COUNT;
+    bool offload_params_to_cpu = false;
+    bool keep_clip_on_cpu = false;
+    bool keep_control_net_on_cpu = false;
+    bool keep_vae_on_cpu = false;
+    bool diffusion_flash_attn = false;
+    bool tae_preview_only = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct = false;
+    bool force_sdxl_vae_conv_scale = false;
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask = false;
+    int chroma_t5_mask_pad = 0;
+    float flow_shift = INFINITY;
+
     fprintf(stderr, "parsing options: %p\n", options);
 
     // If options is not NULL, parse options
@@ -156,6 +252,113 @@ int load_model(const char *model, char *model_path, char* options[], int threads
                 fprintf(stderr, "No model path provided, using lora dir as-is: %s\n", lora_dir);
             }
         }
+
+        // New parsing
+        if (!strcmp(optname, "clip_vision_path")) clip_vision_path = optval;
+        if (!strcmp(optname, "llm_path")) llm_path = optval;
+        if (!strcmp(optname, "llm_vision_path")) llm_vision_path = optval;
+        if (!strcmp(optname, "diffusion_model_path")) diffusion_model_path = optval;
+        if (!strcmp(optname, "high_noise_diffusion_model_path")) high_noise_diffusion_model_path = optval;
+        if (!strcmp(optname, "taesd_path")) taesd_path = optval;
+        if (!strcmp(optname, "control_net_path")) control_net_path = optval;
+        if (!strcmp(optname, "embedding_dir")) embedding_dir = optval;
+        if (!strcmp(optname, "photo_maker_path")) photo_maker_path = optval;
+        if (!strcmp(optname, "tensor_type_rules")) tensor_type_rules = optval;
+
+        if (!strcmp(optname, "vae_decode_only")) vae_decode_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "free_params_immediately")) free_params_immediately = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "offload_params_to_cpu")) offload_params_to_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_clip_on_cpu")) keep_clip_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_control_net_on_cpu")) keep_control_net_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "keep_vae_on_cpu")) keep_vae_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "diffusion_flash_attn")) diffusion_flash_attn = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "tae_preview_only")) tae_preview_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "diffusion_conv_direct")) diffusion_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "vae_conv_direct")) vae_conv_direct = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "force_sdxl_vae_conv_scale")) force_sdxl_vae_conv_scale = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "chroma_use_dit_mask")) chroma_use_dit_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        if (!strcmp(optname, "chroma_use_t5_mask")) chroma_use_t5_mask = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+
+        if (!strcmp(optname, "n_threads")) n_threads = atoi(optval);
+        if (!strcmp(optname, "chroma_t5_mask_pad")) chroma_t5_mask_pad = atoi(optval);
+
+        if (!strcmp(optname, "flow_shift")) flow_shift = atof(optval);
+
+        if (!strcmp(optname, "rng_type")) {
+            int found = -1;
+            for (int m = 0; m < RNG_TYPE_COUNT; m++) {
+                if (!strcmp(optval, rng_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                rng_type = (rng_type_t)found;
+                fprintf(stderr, "Found rng_type: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid rng_type: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "sampler_rng_type")) {
+            int found = -1;
+            for (int m = 0; m < RNG_TYPE_COUNT; m++) {
+                if (!strcmp(optval, rng_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                sampler_rng_type = (rng_type_t)found;
+                fprintf(stderr, "Found sampler_rng_type: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid sampler_rng_type: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "prediction")) {
+            int found = -1;
+            for (int m = 0; m < PREDICTION_COUNT; m++) {
+                if (!strcmp(optval, prediction_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                prediction = (prediction_t)found;
+                fprintf(stderr, "Found prediction: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid prediction: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "lora_apply_mode")) {
+            int found = -1;
+            for (int m = 0; m < LORA_APPLY_MODE_COUNT; m++) {
+                if (!strcmp(optval, lora_apply_mode_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                lora_apply_mode = (lora_apply_mode_t)found;
+                fprintf(stderr, "Found lora_apply_mode: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid lora_apply_mode: %s, using default\n", optval);
+            }
+        }
+        if (!strcmp(optname, "wtype")) {
+            int found = -1;
+            for (int m = 0; m < SD_TYPE_COUNT; m++) {
+                if (sd_type_str[m] && !strcmp(optval, sd_type_str[m])) {
+                    found = m;
+                    break;
+                }
+            }
+            if (found != -1) {
+                wtype = (sd_type_t)found;
+                fprintf(stderr, "Found wtype: %s\n", optval);
+            } else {
+                fprintf(stderr, "Invalid wtype: %s, using default\n", optval);
+            }
+        }
     }
 
     fprintf(stderr, "parsed options\n");
@@ -166,17 +369,40 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     ctx_params.model_path = model;
     ctx_params.clip_l_path = clip_l_path;
     ctx_params.clip_g_path = clip_g_path;
+    ctx_params.clip_vision_path = clip_vision_path;
     ctx_params.t5xxl_path = t5xxl_path;
-    ctx_params.diffusion_model_path = stableDiffusionModel;
+    ctx_params.llm_path = llm_path;
+    ctx_params.llm_vision_path = llm_vision_path;
+    ctx_params.diffusion_model_path = diffusion_model_path;
+    ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path;
     ctx_params.vae_path = vae_path;
-    ctx_params.taesd_path = "";
-    ctx_params.control_net_path = "";
+    ctx_params.taesd_path = taesd_path;
+    ctx_params.control_net_path = control_net_path;
     ctx_params.lora_model_dir = lora_dir;
-    ctx_params.embedding_dir = "";
-    ctx_params.vae_decode_only = false;
-    ctx_params.free_params_immediately = false;
-    ctx_params.n_threads = threads;
-    ctx_params.rng_type = STD_DEFAULT_RNG;
+    ctx_params.embedding_dir = embedding_dir;
+    ctx_params.photo_maker_path = photo_maker_path;
+    ctx_params.tensor_type_rules = tensor_type_rules;
+    ctx_params.vae_decode_only = vae_decode_only;
+    ctx_params.free_params_immediately = free_params_immediately;
+    ctx_params.n_threads = n_threads;
+    ctx_params.rng_type = rng_type;
+    ctx_params.keep_clip_on_cpu = keep_clip_on_cpu;
+    if (wtype != SD_TYPE_COUNT) ctx_params.wtype = wtype;
+    if (sampler_rng_type != RNG_TYPE_COUNT) ctx_params.sampler_rng_type = sampler_rng_type;
+    if (prediction != PREDICTION_COUNT) ctx_params.prediction = prediction;
+    if (lora_apply_mode != LORA_APPLY_MODE_COUNT) ctx_params.lora_apply_mode = lora_apply_mode;
+    ctx_params.offload_params_to_cpu = offload_params_to_cpu;
+    ctx_params.keep_control_net_on_cpu = keep_control_net_on_cpu;
+    ctx_params.keep_vae_on_cpu = keep_vae_on_cpu;
+    ctx_params.diffusion_flash_attn = diffusion_flash_attn;
+    ctx_params.tae_preview_only = tae_preview_only;
+    ctx_params.diffusion_conv_direct = diffusion_conv_direct;
+    ctx_params.vae_conv_direct = vae_conv_direct;
+    ctx_params.force_sdxl_vae_conv_scale = force_sdxl_vae_conv_scale;
+    ctx_params.chroma_use_dit_mask = chroma_use_dit_mask;
+    ctx_params.chroma_use_t5_mask = chroma_use_t5_mask;
+    ctx_params.chroma_t5_mask_pad = chroma_t5_mask_pad;
+    ctx_params.flow_shift = flow_shift;
     sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);
 
     if (sd_ctx == NULL) {
diff --git a/gallery/index.yaml b/gallery/index.yaml
index 610a8d70c..17d18f8f0 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -20911,6 +20911,9 @@
   overrides:
     parameters:
       model: flux1-dev-Q2_K.gguf
+    options:
+      - scheduler:simple
+      - keep_clip_on_cpu:true
   files:
     - filename: "flux1-dev-Q2_K.gguf"
       sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105"