diff --git a/backend/go/stablediffusion-ggml/Makefile b/backend/go/stablediffusion-ggml/Makefile index 50c242897..9f768489b 100644 --- a/backend/go/stablediffusion-ggml/Makefile +++ b/backend/go/stablediffusion-ggml/Makefile @@ -8,10 +8,16 @@ JOBS?=$(shell nproc --ignore=1) # stablediffusion.cpp (ggml) STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp -STABLEDIFFUSION_GGML_VERSION?=276025e054555166ec419413c6748ca79986ee93 +STABLEDIFFUSION_GGML_VERSION?=5a34bc7f6e0621dd2f899daa64476eac667d7ed3 CMAKE_ARGS+=-DGGML_MAX_NAME=128 +# Enable the ggml RPC backend so generation can be sharded across remote +# rpc-server workers (the same backend-agnostic ggml rpc-server used by the +# llama.cpp backend). Servers are selected via the `rpc_servers` option or the +# LLAMACPP_GRPC_SERVERS env var (populated automatically in p2p worker mode). +CMAKE_ARGS+=-DSD_RPC=ON + ifeq ($(NATIVE),false) CMAKE_ARGS+=-DGGML_NATIVE=OFF endif diff --git a/backend/go/stablediffusion-ggml/cpp/gosd.cpp b/backend/go/stablediffusion-ggml/cpp/gosd.cpp index 607d3354c..93b3b16da 100644 --- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp +++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp @@ -391,10 +391,18 @@ int load_model(const char *model, char *model_path, char* options[], int threads const char *control_net_path = ""; const char *embedding_dir = ""; const char *photo_maker_path = ""; + const char *pulid_weights_path = ""; const char *tensor_type_rules = ""; char *lora_dir = model_path; - bool vae_decode_only = true; + // Upstream backend/parameter placement specs (see docs/.../stablediffusion). + // Empty means "leave at upstream default" (nullptr). 
+ const char *backend_arg = ""; + const char *params_backend_arg = ""; + const char *rpc_servers_arg = ""; + const char *max_vram_arg = ""; + bool stream_layers = false; + int n_threads = threads; enum sd_type_t wtype = SD_TYPE_COUNT; enum rng_type_t rng_type = CUDA_RNG; @@ -418,7 +426,9 @@ int load_model(const char *model, char *model_path, char* options[], int threads // If options is not NULL, parse options for (int i = 0; options[i] != NULL; i++) { const char *optname = strtok(options[i], ":"); - const char *optval = strtok(NULL, ":"); + // Take everything after the first ':' as the value so values may + // themselves contain colons (e.g. rpc_servers host:port lists). + const char *optval = strtok(NULL, ""); if (optval == NULL) { optval = "true"; } @@ -490,9 +500,21 @@ int load_model(const char *model, char *model_path, char* options[], int threads } } if (!strcmp(optname, "photo_maker_path")) photo_maker_path = strdup(optval); + if (!strcmp(optname, "pulid_weights_path")) pulid_weights_path = strdup(optval); if (!strcmp(optname, "tensor_type_rules")) tensor_type_rules = strdup(optval); - if (!strcmp(optname, "vae_decode_only")) vae_decode_only = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0); + // Backend / parameter placement specs (see prepare_backend_assignments + // in the upstream CLI). These compose with the legacy keep_*_on_cpu / + // offload_params_to_cpu booleans below. 
+ if (!strcmp(optname, "backend")) backend_arg = strdup(optval); + if (!strcmp(optname, "params_backend")) params_backend_arg = strdup(optval); + if (!strcmp(optname, "rpc_servers")) rpc_servers_arg = strdup(optval); + if (!strcmp(optname, "max_vram")) max_vram_arg = strdup(optval); + if (!strcmp(optname, "stream_layers")) stream_layers = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0); + + // vae_decode_only is still accepted for backwards compatibility with + // existing gallery configs, but upstream dropped the option (the model + // now decides), so it is parsed and ignored. if (!strcmp(optname, "offload_params_to_cpu")) offload_params_to_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0); if (!strcmp(optname, "keep_clip_on_cpu")) keep_clip_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0); if (!strcmp(optname, "keep_control_net_on_cpu")) keep_control_net_on_cpu = (strcmp(optval, "true") == 0 || strcmp(optval, "1") == 0); @@ -591,20 +613,48 @@ int load_model(const char *model, char *model_path, char* options[], int threads ctx_params.embeddings = embedding_vec.empty() ? 
NULL : embedding_vec.data(); ctx_params.embedding_count = static_cast(embedding_vec.size()); ctx_params.photo_maker_path = photo_maker_path; + if (strlen(pulid_weights_path) > 0) ctx_params.pulid_weights_path = pulid_weights_path; ctx_params.tensor_type_rules = tensor_type_rules; - ctx_params.vae_decode_only = vae_decode_only; - // XXX: Setting to true causes a segfault on the second run - ctx_params.free_params_immediately = false; ctx_params.n_threads = n_threads; ctx_params.rng_type = rng_type; - ctx_params.keep_clip_on_cpu = keep_clip_on_cpu; if (wtype != SD_TYPE_COUNT) ctx_params.wtype = wtype; if (sampler_rng_type != RNG_TYPE_COUNT) ctx_params.sampler_rng_type = sampler_rng_type; if (prediction != PREDICTION_COUNT) ctx_params.prediction = prediction; if (lora_apply_mode != LORA_APPLY_MODE_COUNT) ctx_params.lora_apply_mode = lora_apply_mode; - ctx_params.offload_params_to_cpu = offload_params_to_cpu; - ctx_params.keep_control_net_on_cpu = keep_control_net_on_cpu; - ctx_params.keep_vae_on_cpu = keep_vae_on_cpu; + // Backend / parameter placement specs. Upstream replaced the boolean + // CPU-offload knobs (offload_params_to_cpu, keep_clip_on_cpu, keep_vae_on_cpu, + // keep_control_net_on_cpu) with these specs. Seed from the explicit + // backend/params_backend options, then prepend the legacy boolean-derived + // assignments, mirroring prepare_backend_assignments() in the upstream CLI. + // These strings must outlive new_sd_ctx() below. + std::string backend_spec = backend_arg; + std::string params_backend_spec = params_backend_arg; + auto prepend_spec = [](std::string& spec, const char* assignment) { + spec = spec.empty() ? 
std::string(assignment) : std::string(assignment) + "," + spec; + }; + if (offload_params_to_cpu) prepend_spec(params_backend_spec, "*=cpu"); + if (keep_clip_on_cpu) prepend_spec(backend_spec, "te=cpu"); + if (keep_vae_on_cpu) prepend_spec(backend_spec, "vae=cpu"); + if (keep_control_net_on_cpu) prepend_spec(backend_spec, "controlnet=cpu"); + if (!backend_spec.empty()) ctx_params.backend = backend_spec.c_str(); + if (!params_backend_spec.empty()) ctx_params.params_backend = params_backend_spec.c_str(); + // RPC servers: prefer the explicit option, otherwise fall back to the + // LLAMACPP_GRPC_SERVERS env var. LocalAI's p2p worker mode populates that + // var with discovered ggml rpc-server workers (shared with the llama.cpp + // backend), so distributed image generation works with no extra config. + if (strlen(rpc_servers_arg) > 0) { + ctx_params.rpc_servers = rpc_servers_arg; + } else { + const char* env_rpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (env_rpc_servers != NULL && strlen(env_rpc_servers) > 0) { + ctx_params.rpc_servers = env_rpc_servers; + } + } + // max_vram: GiB budget or per-backend spec for graph-cut segmented param + // offload ("0" = disabled, "-1" = auto). stream_layers only has effect when + // max_vram is set. + if (strlen(max_vram_arg) > 0) ctx_params.max_vram = max_vram_arg; + ctx_params.stream_layers = stream_layers; ctx_params.diffusion_flash_attn = diffusion_flash_attn; ctx_params.tae_preview_only = tae_preview_only; ctx_params.diffusion_conv_direct = diffusion_conv_direct; diff --git a/docs/content/features/image-generation.md b/docs/content/features/image-generation.md index bb9748dd9..9430a3a3b 100644 --- a/docs/content/features/image-generation.md +++ b/docs/content/features/image-generation.md @@ -71,6 +71,63 @@ options: 2. Download the required assets to the `models` repository 3. 
Start LocalAI +#### Memory and device placement options + +When a model does not fit entirely in VRAM, the following `options:` control where weights and computation are placed. They map directly to the upstream stable-diffusion.cpp options. + +| Option | Example | Description | +|--------|---------|-------------| +| `backend` | `backend:te=cpu,vae=cuda0,diffusion=vulkan0` | Runtime (compute) backend assignment per component. Use `cpu` to place a component's compute on the CPU. Component keys include `te` (text encoder / CLIP), `vae`, `diffusion`, `controlnet`. | +| `params_backend` | `params_backend:diffusion=disk,te=cpu` | Where parameters (weights) are stored. Supports `cpu`, `disk` (mmap weights from disk to save RAM/VRAM), or per-component specs. | +| `max_vram` | `max_vram:8` or `max_vram:-1` | VRAM budget (in GiB) for graph-cut segmented parameter offload. `0` disables it, `-1` auto-selects (free VRAM minus ~1 GiB). Also accepts per-backend budgets. | +| `stream_layers` | `stream_layers:true` | Enable residency + prefetch streaming on top of `max_vram` (no effect unless `max_vram` is set). | +| `rpc_servers` | `rpc_servers:localhost:50052,192.168.1.3:50052` | Comma-separated list of `host:port` RPC servers to offload compute to. | +| `pulid_weights_path` | `pulid_weights_path:pulid.safetensors` | Path to PuLID-Flux weights for identity injection. 
| + +The following convenience booleans are still accepted; each one prepends the assignment shown below to the corresponding `backend` / `params_backend` spec, so it combines with any value set explicitly through the options above: + +| Option | Equivalent spec | +|--------|-----------------| +| `offload_params_to_cpu:true` | `params_backend` += `*=cpu` | +| `keep_clip_on_cpu:true` | `backend` += `te=cpu` | +| `keep_vae_on_cpu:true` | `backend` += `vae=cpu` | +| `keep_control_net_on_cpu:true` | `backend` += `controlnet=cpu` | + +For example, to mmap the diffusion weights from disk while keeping the text encoder on the CPU: + +```yaml +options: +- "diffusion_model" +- "sampler:euler" +- "params_backend:diffusion=disk" +- "keep_clip_on_cpu:true" +``` + +{{% alert note %}} +`vae_decode_only` is still accepted for backwards compatibility but is now a no-op: upstream removed the flag and the model decides automatically. +{{% /alert %}} + +#### Distributed inference (RPC workers) + +The `stablediffusion-ggml` backend can offload computation to remote `ggml` RPC workers, sharding a model that does not fit on a single machine. It reuses the **same backend-agnostic `rpc-server` workers as the llama.cpp backend**, so one worker pool can serve both. + +**Manual:** point the model at running workers with the `rpc_servers` option: + +```yaml +options: +- "rpc_servers:192.168.1.10:50052,192.168.1.11:50052" +``` + +Start a worker on each remote machine the same way you would for llama.cpp: + +```bash +local-ai worker llama-cpp-rpc --llama-cpp-args="--host 0.0.0.0 --port 50052" +``` + +**Automatic (peer-to-peer):** when LocalAI runs in [p2p worker mode]({{%relref "features/distributed_inferencing" %}}), discovered workers are published in the `LLAMACPP_GRPC_SERVERS` environment variable. The image-generation backend reads that variable automatically (when `rpc_servers` is not set), so the same `local-ai worker p2p-llama-cpp-rpc` workers used for text generation also accelerate image generation — no per-model configuration needed. 
+ +By default the RPC devices join the pool and participate in placement; combine with the `backend` / `params_backend` options above to pin specific components to them (e.g. `backend:diffusion=rpc0`). + ### Diffusers