mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-28 02:17:00 -04:00
In distributed mode, even when the frontend and workers share the same models directory via a shared volume mount, starting a model on a worker re-staged (re-downloaded) it: stageModelFiles always uploads model files into a tracking-key-namespaced subdir on the worker, and the staging probe only checks that staged location, so a file already present on the shared volume at the canonical path was never reused. Add a config switch LOCALAI_DISTRIBUTED_SHARED_MODELS (default false). When enabled, the operator asserts that all nodes mount the SAME models directory at the SAME path, so staging is unnecessary: the frontend's absolute model paths are already valid on the worker. In that mode stageModelFiles returns the cloned opts unchanged without uploading, leaving the path fields pointing at their canonical absolute paths so the worker loads them directly from the shared volume. The value is plumbed from DistributedConfig through SmartRouterOptions into the SmartRouter. Docs and docker-compose.distributed.yaml updated. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
226 lines
8.0 KiB
YAML
226 lines
8.0 KiB
YAML
# Docker Compose for LocalAI Distributed Mode
|
|
#
|
|
# Starts a full distributed stack: PostgreSQL, NATS, a LocalAI frontend,
|
|
# one generic worker node, and one agent worker.
|
|
#
|
|
# Model files are transferred from the frontend to backend nodes via HTTP
|
|
# — no shared volumes needed between frontend and backends.
|
|
#
|
|
# Usage:
|
|
# docker compose -f docker-compose.distributed.yaml up
|
|
#
|
|
# See docs: https://localai.io/features/distributed-mode/
|
|
|
|
services:
|
|
# --- Infrastructure ---
|
|
|
|
postgres:
  # PostgreSQL with pgvector
  image: quay.io/mudler/localrecall:v0.5.5-postgresql
  environment:
    POSTGRES_DB: "localai"
    POSTGRES_USER: "localai"
    POSTGRES_PASSWORD: "localai"
  volumes:
    - postgres_data:/var/lib/postgresql
  # The frontend's depends_on waits for this check (condition: service_healthy).
  healthcheck:
    test:
      - "CMD-SHELL"
      - "pg_isready -U localai"
    interval: 5s
    timeout: 3s
    retries: 10
|
|
|
|
nats:
  image: nats:2-alpine
  ports:
    # Client connections
    - "4222:4222"
    # HTTP monitoring (optional, useful for debugging)
    - "8222:8222"
  # Enable JetStream (--js) and the monitoring endpoint on port 8222 (-m)
  command:
    - "--js"
    - "-m"
    - "8222"
|
|
|
|
# --- LocalAI Frontend ---
|
|
# Stateless API server that routes requests to backend nodes.
|
|
# Add more replicas behind a load balancer for HA.
|
|
|
|
localai:
  # image: localai/localai:latest-cpu
  build:
    context: .
    dockerfile: Dockerfile
    args:
      IMAGE_TYPE: "core"
      BASE_IMAGE: "ubuntu:24.04"
  ports:
    - "8080:8080"
  environment:
    # Distributed mode
    LOCALAI_DISTRIBUTED: "true"
    LOCALAI_NATS_URL: "nats://nats:4222"
    LOCALAI_AGENT_POOL_EMBEDDING_MODEL: "granite-embedding-107m-multilingual"
    LOCALAI_AGENT_POOL_VECTOR_ENGINE: "postgres"
    LOCALAI_AGENT_POOL_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"
    # Change this in production! Workers must present the same token.
    LOCALAI_REGISTRATION_TOKEN: "changeme"
    # Shared-models mode (optional): enable only when every node mounts the
    # SAME models directory at the SAME path (see "Shared Volume Mode" below).
    # The router then skips model file staging and workers load models
    # directly from the shared volume instead of re-downloading them.
    # LOCALAI_DISTRIBUTED_SHARED_MODELS: "true"
    # Auth (required for distributed mode — must use PostgreSQL)
    LOCALAI_AUTH: "true"
    LOCALAI_AUTH_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"
    # Force the pure-Go DNS resolver. The default cgo resolver follows the
    # container's nsswitch.conf and ends up forwarding to the host's
    # systemd-resolved (127.0.0.53), which isn't reachable from inside the
    # container — failing every postgres/nats hostname lookup at boot.
    # The pure-Go path reads /etc/resolv.conf directly and uses Docker's
    # embedded DNS at 127.0.0.11.
    GODEBUG: "netdns=go"
    # Paths
    MODELS_PATH: "/models"
  volumes:
    - frontend_models:/models
    - frontend_data:/data
  depends_on:
    # Wait for the database healthcheck to pass before starting.
    postgres:
      condition: service_healthy
    nats:
      condition: service_started
|
|
|
|
# --- Worker Node ---
|
|
# A generic worker that self-registers with the frontend.
|
|
# The same LocalAI image is used — no separate image needed.
|
|
# The SmartRouter dynamically tells workers which backend to install via NATS.
|
|
#
|
|
# Model files are transferred from the frontend via HTTP file staging.
|
|
# The worker has its own independent models volume.
|
|
|
|
worker-1:
  # image: localai/localai:latest-cpu
  build:
    context: .
    dockerfile: Dockerfile
    args:
      IMAGE_TYPE: "core"
      BASE_IMAGE: "ubuntu:24.04"
  command:
    - worker
  environment:
    # The image's default HEALTHCHECK targets the server's /readyz on 8080.
    # Workers don't run the OpenAI API server — their HTTP file transfer
    # server runs on the gRPC base port - 1 (50050 here) and exposes /readyz.
    # Overriding this variable makes the inherited HEALTHCHECK probe that port.
    HEALTHCHECK_ENDPOINT: "http://localhost:50050/readyz"
    LOCALAI_SERVE_ADDR: "0.0.0.0:50051"
    LOCALAI_ADVERTISE_ADDR: "worker-1:50051"
    LOCALAI_ADVERTISE_HTTP_ADDR: "worker-1:50050"
    DEBUG: "true"
    LOCALAI_REGISTER_TO: "http://localai:8080"
    LOCALAI_NODE_NAME: "worker-1"
    # Must match the frontend's LOCALAI_REGISTRATION_TOKEN
    LOCALAI_REGISTRATION_TOKEN: "changeme"
    LOCALAI_HEARTBEAT_INTERVAL: "10s"
    LOCALAI_NATS_URL: "nats://nats:4222"
    # Pure-Go DNS resolver; see the note in the localai service
    GODEBUG: "netdns=go"
    MODELS_PATH: "/models"
  volumes:
    - worker_1_models:/models
  depends_on:
    localai:
      condition: service_started
    nats:
      condition: service_started
|
|
|
|
# --- GPU Support (NVIDIA) ---
|
|
# Uncomment the following and change the image to a CUDA variant
|
|
# (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable GPU.
|
|
#
|
|
# NVIDIA_DRIVER_CAPABILITIES must include `utility` so nvidia-smi / NVML
|
|
# are available inside the container; without it the worker cannot report
|
|
# free VRAM and the Nodes page will show 0 free / total used.
|
|
# `init: true` avoids zombie-reap races that make nvidia-smi flaky.
|
|
#
|
|
# init: true
|
|
# environment:
|
|
# NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
|
|
# deploy:
|
|
# resources:
|
|
# reservations:
|
|
# devices:
|
|
# - driver: nvidia.com/gpu
|
|
# count: all
|
|
# capabilities: [gpu, utility]
|
|
|
|
# --- Shared Volume Mode (optional) ---
|
|
# If all services run on the same Docker host, you can skip model file transfer
|
|
# by sharing a single models volume. Replace the volumes above with:
|
|
#
|
|
# localai:
|
|
# volumes:
|
|
# - shared_models:/models
|
|
# - frontend_data:/data
|
|
#
|
|
# worker-1:
|
|
# volumes:
|
|
# - shared_models:/models
|
|
#
|
|
# Then add to the volumes section:
|
|
# shared_models:
|
|
#
|
|
# With shared volumes the model files are already present on every worker at
|
|
# the same path. Set LOCALAI_DISTRIBUTED_SHARED_MODELS=true on the frontend
|
|
# (see its environment above) so the router skips model file staging and the
|
|
# worker loads the model directly from the shared path instead of
|
|
# re-downloading it into a per-model subdirectory.
|
|
|
|
# --- Adding More Workers ---
|
|
# Copy the worker-1 service above and change:
|
|
# - Service name (e.g., worker-2)
|
|
# - LOCALAI_NODE_NAME (must be unique)
|
|
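# - LOCALAI_ADVERTISE_ADDR and LOCALAI_ADVERTISE_HTTP_ADDR (host must match service name)
# - Models volume (e.g., worker_2_models), also declared under the top-level volumes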
|
|
#
|
|
# Workers are generic — no backend type needed. The SmartRouter
|
|
# will dynamically install the required backend via NATS when
|
|
# a model request arrives.
|
|
|
|
# --- Agent Worker ---
|
|
# Dedicated process for agent chat execution.
|
|
# Receives chat jobs from NATS, runs cogito LLM calls via the LocalAI API,
|
|
# and publishes results back via NATS for SSE delivery.
|
|
# No database access needed — config and skills are sent in the NATS payload.
|
|
|
|
agent-worker-1:
  # image: localai/localai:latest-cpu
  build:
    context: .
    dockerfile: Dockerfile
    args:
      - IMAGE_TYPE=core
      - BASE_IMAGE=ubuntu:24.04
  # Install the Docker CLI, then start agent-worker.
  # The Docker socket is mounted from the host so that MCP stdio servers
  # using "docker run" commands can spawn containers on the host Docker.
  entrypoint: ["/bin/sh", "-c"]
  command:
    - |
      # Skip the install when the CLI is already present (e.g. on restart).
      if ! command -v docker >/dev/null 2>&1; then
        # Best effort: the worker still starts if the install fails, but the
        # failure is reported instead of being silently discarded.
        apt-get update -qq && apt-get install -y -qq docker.io >/dev/null 2>&1 \
          || echo "WARNING: docker CLI install failed; MCP servers using 'docker run' will not work" >&2
      fi
      exec /entrypoint.sh agent-worker
  # The agent worker is NATS-only — no HTTP server to probe. Disable the
  # image's inherited HEALTHCHECK so the container doesn't show unhealthy.
  healthcheck:
    disable: true
  environment:
    LOCALAI_NATS_URL: "nats://nats:4222"
    LOCALAI_REGISTER_TO: "http://localai:8080"
    LOCALAI_NODE_NAME: "agent-worker-1"
    LOCALAI_REGISTRATION_TOKEN: "changeme" # Must match frontend token
    GODEBUG: "netdns=go" # See note in localai service
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock
  depends_on:
    localai:
      condition: service_started
    nats:
      condition: service_started
|
|
|
|
# Named volumes with default driver settings.
volumes:
  postgres_data: {}
  frontend_models: {}
  frontend_data: {}
  worker_1_models: {}
|