diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile
index 50c242897..9f768489b 100644
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,10 +8,16 @@ JOBS?=$(shell nproc --ignore=1)
 
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=276025e054555166ec419413c6748ca79986ee93
+STABLEDIFFUSION_GGML_VERSION?=5a34bc7f6e0621dd2f899daa64476eac667d7ed3
 
 CMAKE_ARGS+=-DGGML_MAX_NAME=128
 
+# Enable the ggml RPC backend so generation can be sharded across remote
+# rpc-server workers (the same backend-agnostic ggml rpc-server used by the
+# llama.cpp backend). Servers are selected via the `rpc_servers` option or the
+# LLAMACPP_GRPC_SERVERS env var (populated automatically in p2p worker mode).
+CMAKE_ARGS+=-DSD_RPC=ON
+
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
diff --git a/backend/go/stablediffusion-ggml/cpp/gosd.cpp b/backend/go/stablediffusion-ggml/cpp/gosd.cpp
index 607d3354c..93b3b16da 100644
--- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp
@@ -391,10 +391,18 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     const char *control_net_path = "";
     const char *embedding_dir = "";
     const char *photo_maker_path = "";
+    const char *pulid_weights_path = "";
     const char *tensor_type_rules = "";
     char *lora_dir = model_path;
 
-    bool vae_decode_only = true;
+    // Backend/parameter placement specs (see docs/content/features/image-generation.md).
+    // Empty means "option not given": the matching ctx_params field is then left untouched.
+    const char *backend_arg = "";
+    const char *params_backend_arg = "";
+    const char *rpc_servers_arg = "";
+    const char *max_vram_arg = "";
+    bool stream_layers = false;
+
     int n_threads = threads;
     enum sd_type_t wtype = SD_TYPE_COUNT;
     enum rng_type_t rng_type = CUDA_RNG;
@@ -418,7 +426,9 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     // If options is not NULL, parse options
     for (int i = 0; options[i] != NULL; i++) {
         const char *optname = strtok(options[i], ":");
-        const char *optval = strtok(NULL, ":");
+        // Take everything after the first ':' as the value so values may
+        // themselves contain colons (e.g. rpc_servers host:port lists).
+        const char *optval = strtok(NULL, "");
         if (optval == NULL) {
             optval = "true";
         }
@@ -490,9 +500,21 @@ int load_model(const char *model, char *model_path, char* options[], int threads
             }
         }
         if (!strcmp(optname, "photo_maker_path")) photo_maker_path = strdup(optval);
+        if (!strcmp(optname, "pulid_weights_path")) pulid_weights_path = strdup(optval);
         if (!strcmp(optname, "tensor_type_rules")) tensor_type_rules = strdup(optval);
 
-        if (!strcmp(optname, "vae_decode_only")) vae_decode_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+        // Backend / parameter placement specs (see prepare_backend_assignments
+        // in the upstream CLI). These compose with the legacy keep_*_on_cpu /
+        // offload_params_to_cpu booleans below.
+        if (!strcmp(optname, "backend")) backend_arg = strdup(optval);
+        if (!strcmp(optname, "params_backend")) params_backend_arg = strdup(optval);
+        if (!strcmp(optname, "rpc_servers")) rpc_servers_arg = strdup(optval);
+        if (!strcmp(optname, "max_vram")) max_vram_arg = strdup(optval);
+        if (!strcmp(optname, "stream_layers")) stream_layers = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
+
+        // NOTE: there is intentionally no vae_decode_only handler here. Upstream
+        // dropped the option (the model now decides), so a value coming from an
+        // existing gallery config is accepted for compatibility but ignored.
         if (!strcmp(optname, "offload_params_to_cpu")) offload_params_to_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
         if (!strcmp(optname, "keep_clip_on_cpu")) keep_clip_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
         if (!strcmp(optname, "keep_control_net_on_cpu")) keep_control_net_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0);
@@ -591,20 +613,48 @@ int load_model(const char *model, char *model_path, char* options[], int threads
     ctx_params.embeddings = embedding_vec.empty() ? NULL : embedding_vec.data();
     ctx_params.embedding_count = static_cast<uint32_t>(embedding_vec.size());
     ctx_params.photo_maker_path = photo_maker_path;
+    if (strlen(pulid_weights_path) > 0) ctx_params.pulid_weights_path = pulid_weights_path;
     ctx_params.tensor_type_rules = tensor_type_rules;
-    ctx_params.vae_decode_only = vae_decode_only;
-    // XXX: Setting to true causes a segfault on the second run
-    ctx_params.free_params_immediately = false;
     ctx_params.n_threads = n_threads;
     ctx_params.rng_type = rng_type;
-    ctx_params.keep_clip_on_cpu = keep_clip_on_cpu;
     if (wtype != SD_TYPE_COUNT) ctx_params.wtype = wtype;
     if (sampler_rng_type != RNG_TYPE_COUNT) ctx_params.sampler_rng_type = sampler_rng_type;
     if (prediction != PREDICTION_COUNT) ctx_params.prediction = prediction;
     if (lora_apply_mode != LORA_APPLY_MODE_COUNT) ctx_params.lora_apply_mode = lora_apply_mode;
-    ctx_params.offload_params_to_cpu = offload_params_to_cpu;
-    ctx_params.keep_control_net_on_cpu = keep_control_net_on_cpu;
-    ctx_params.keep_vae_on_cpu = keep_vae_on_cpu;
+    // Backend / parameter placement specs. Upstream replaced the boolean
+    // CPU-offload knobs (offload_params_to_cpu, keep_clip_on_cpu, keep_vae_on_cpu,
+    // keep_control_net_on_cpu) with these specs. Seed from the explicit
+    // backend/params_backend options, then prepend the legacy boolean-derived
+    // assignments, mirroring prepare_backend_assignments() in the upstream CLI.
+    // These strings must outlive new_sd_ctx() below.
+    std::string backend_spec = backend_arg;
+    std::string params_backend_spec = params_backend_arg;
+    auto prepend_spec = [](std::string& spec, const char* assignment) {
+        spec = spec.empty() ? std::string(assignment) : std::string(assignment) + "," + spec;
+    };
+    if (offload_params_to_cpu) prepend_spec(params_backend_spec, "*=cpu");
+    if (keep_clip_on_cpu) prepend_spec(backend_spec, "te=cpu");
+    if (keep_vae_on_cpu) prepend_spec(backend_spec, "vae=cpu");
+    if (keep_control_net_on_cpu) prepend_spec(backend_spec, "controlnet=cpu");
+    if (!backend_spec.empty()) ctx_params.backend = backend_spec.c_str();
+    if (!params_backend_spec.empty()) ctx_params.params_backend = params_backend_spec.c_str();
+    // RPC servers: prefer the explicit option, otherwise fall back to the
+    // LLAMACPP_GRPC_SERVERS env var. LocalAI's p2p worker mode populates that
+    // var with discovered ggml rpc-server workers (shared with the llama.cpp
+    // backend), so distributed image generation works with no extra config.
+    if (strlen(rpc_servers_arg) > 0) {
+        ctx_params.rpc_servers = rpc_servers_arg;
+    } else {
+        const char* env_rpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
+        if (env_rpc_servers != NULL && strlen(env_rpc_servers) > 0) {
+            ctx_params.rpc_servers = env_rpc_servers;
+        }
+    }
+    // max_vram: GiB budget or per-backend spec for graph-cut segmented param
+    // offload ("0" = disabled, "-1" = auto). stream_layers only has effect when
+    // max_vram is set.
+    if (strlen(max_vram_arg) > 0) ctx_params.max_vram = max_vram_arg;
+    ctx_params.stream_layers = stream_layers;
     ctx_params.diffusion_flash_attn = diffusion_flash_attn;
     ctx_params.tae_preview_only = tae_preview_only;
     ctx_params.diffusion_conv_direct = diffusion_conv_direct;
diff --git a/docs/content/features/image-generation.md b/docs/content/features/image-generation.md
index bb9748dd9..9430a3a3b 100644
--- a/docs/content/features/image-generation.md
+++ b/docs/content/features/image-generation.md
@@ -71,6 +71,63 @@ options:
 2. Download the required assets to the `models` repository
 3. Start LocalAI
 
+#### Memory and device placement options
+
+When a model does not fit entirely in VRAM, the following `options:` control where weights and computation are placed. They map directly to the upstream stable-diffusion.cpp options.
+
+| Option | Example | Description |
+|--------|---------|-------------|
+| `backend` | `backend:te=cpu,vae=cuda0,diffusion=vulkan0` | Runtime (compute) backend assignment per component. Use `cpu` to place a component's compute on the CPU. Component keys include `te` (text encoder / CLIP), `vae`, `diffusion`, `controlnet`. |
+| `params_backend` | `params_backend:diffusion=disk,te=cpu` | Where parameters (weights) are stored. Supports `cpu`, `disk` (mmap weights from disk to save RAM/VRAM), or per-component specs. |
+| `max_vram` | `max_vram:8` or `max_vram:-1` | VRAM budget (in GiB) for graph-cut segmented parameter offload. `0` disables it, `-1` auto-selects (free VRAM minus ~1 GiB). Also accepts per-backend budgets. |
+| `stream_layers` | `stream_layers:true` | Enable residency + prefetch streaming on top of `max_vram` (no effect unless `max_vram` is set). |
+| `rpc_servers` | `rpc_servers:localhost:50052,192.168.1.3:50052` | Comma-separated list of `host:port` RPC servers to offload compute to. |
+| `pulid_weights_path` | `pulid_weights_path:pulid.safetensors` | Path to PuLID-Flux weights for identity injection. |
+
+The following convenience booleans are still accepted and are translated into the `backend` / `params_backend` specs above; each derived assignment is placed in front of any explicitly configured spec:
+
+| Option | Equivalent spec |
+|--------|-----------------|
+| `offload_params_to_cpu:true` | `params_backend` += `*=cpu` |
+| `keep_clip_on_cpu:true` | `backend` += `te=cpu` |
+| `keep_vae_on_cpu:true` | `backend` += `vae=cpu` |
+| `keep_control_net_on_cpu:true` | `backend` += `controlnet=cpu` |
+
+For example, to mmap the diffusion weights from disk while keeping the text encoder on the CPU:
+
+```yaml
+options:
+- "diffusion_model"
+- "sampler:euler"
+- "params_backend:diffusion=disk"
+- "keep_clip_on_cpu:true"
+```
+
+{{% alert note %}}
+`vae_decode_only` is still accepted for backwards compatibility but is now a no-op: upstream removed the flag and the model decides automatically.
+{{% /alert %}}
+
+#### Distributed inference (RPC workers)
+
+The `stablediffusion-ggml` backend can offload computation to remote `ggml` RPC workers, sharding a model that does not fit on a single machine. It reuses the **same backend-agnostic `rpc-server` workers as the llama.cpp backend**, so one worker pool can serve both.
+
+**Manual:** point the model at running workers with the `rpc_servers` option:
+
+```yaml
+options:
+- "rpc_servers:192.168.1.10:50052,192.168.1.11:50052"
+```
+
+Start a worker on each remote machine the same way you would for llama.cpp:
+
+```bash
+local-ai worker llama-cpp-rpc --llama-cpp-args="--host 0.0.0.0 --port 50052"
+```
+
+**Automatic (peer-to-peer):** when LocalAI runs in [p2p worker mode]({{%relref "features/distributed_inferencing" %}}), discovered workers are published in the `LLAMACPP_GRPC_SERVERS` environment variable. The image-generation backend reads that variable automatically (when `rpc_servers` is not set), so the same `local-ai worker p2p-llama-cpp-rpc` workers used for text generation also accelerate image generation - no per-model configuration needed.
+
+By default the RPC devices join the pool and participate in placement; combine with the `backend` / `params_backend` options above to pin specific components to them (e.g. `backend:diffusion=rpc0`).
+
 
 ### Diffusers