# LocalAI/docker-compose.distributed.yaml

# Docker Compose for LocalAI Distributed Mode
#
# Starts a full distributed stack: PostgreSQL, NATS, a LocalAI frontend,
# one generic worker node, and one agent worker.
#
# Model files are transferred from the frontend to backend nodes via HTTP
# — no shared volumes needed between frontend and backends.
#
# Usage:
#   docker compose -f docker-compose.distributed.yaml up
#
# See docs: https://localai.io/features/distributed-mode/

services:
  # --- Infrastructure ---

  postgres:
    # PostgreSQL image with pgvector.
    image: quay.io/mudler/localrecall:v0.5.5-postgresql
    environment:
      POSTGRES_DB: "localai"
      POSTGRES_USER: "localai"
      POSTGRES_PASSWORD: "localai"
    volumes:
      - "postgres_data:/var/lib/postgresql"
    # The frontend waits on this check (depends_on: service_healthy), so it
    # only starts once the server accepts connections for user `localai`.
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U localai
      interval: 5s
      timeout: 3s
      retries: 10

  nats:
    image: nats:2-alpine
    # --js enables JetStream; -m 8222 enables the HTTP monitoring port.
    command:
      - "--js"
      - "-m"
      - "8222"
    ports:
      # Client connections
      - "4222:4222"
      # HTTP monitoring (optional, useful for debugging)
      - "8222:8222"

  # --- LocalAI Frontend ---
  # Stateless API server that routes requests to backend nodes.
  # Add more replicas behind a load balancer for HA.

  localai:
    # image: localai/localai:latest-cpu
    build:
      context: .
      dockerfile: Dockerfile
      args:
        IMAGE_TYPE: "core"
        BASE_IMAGE: "ubuntu:24.04"
    # Wait for a healthy database; NATS only needs to have been started.
    depends_on:
      nats:
        condition: service_started
      postgres:
        condition: service_healthy
    ports:
      - "8080:8080"
    volumes:
      - "frontend_models:/models"
      - "frontend_data:/data"
    environment:
      # --- Distributed mode ---
      LOCALAI_DISTRIBUTED: "true"
      LOCALAI_NATS_URL: "nats://nats:4222"
      # Workers must be configured with the same token.
      # Change this in production!
      LOCALAI_REGISTRATION_TOKEN: "changeme"
      # Optional shared-models mode: enable only when every node mounts the
      # SAME models directory at the SAME path (see "Shared Volume Mode"
      # below). The router then skips file staging and workers load models
      # straight from the shared volume instead of re-downloading them.
      # LOCALAI_DISTRIBUTED_SHARED_MODELS: "true"

      # --- Agent pool ---
      LOCALAI_AGENT_POOL_EMBEDDING_MODEL: "granite-embedding-107m-multilingual"
      LOCALAI_AGENT_POOL_VECTOR_ENGINE: "postgres"
      LOCALAI_AGENT_POOL_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"

      # --- Auth (required for distributed mode — must use PostgreSQL) ---
      LOCALAI_AUTH: "true"
      LOCALAI_AUTH_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"

      # --- DNS ---
      # Use Go's pure-Go resolver. The default cgo resolver honours the
      # container's nsswitch.conf and ends up forwarding to the host's
      # systemd-resolved (127.0.0.53), which is unreachable from inside the
      # container, so every postgres/nats hostname lookup fails at boot.
      # The pure-Go resolver reads /etc/resolv.conf directly and therefore
      # uses Docker's embedded DNS at 127.0.0.11.
      GODEBUG: "netdns=go"

      # --- Paths ---
      MODELS_PATH: "/models"

  # --- Worker Node ---
  # A generic worker that self-registers with the frontend.
  # The same LocalAI image is used — no separate image needed.
  # The SmartRouter dynamically tells workers which backend to install via NATS.
  #
  # Model files are transferred from the frontend via HTTP file staging.
  # The worker has its own independent models volume.

  worker-1:
    # image: localai/localai:latest-cpu
    build:
      context: .
      dockerfile: Dockerfile
      args:
        IMAGE_TYPE: "core"
        BASE_IMAGE: "ubuntu:24.04"
    command: ["worker"]
    depends_on:
      nats:
        condition: service_started
      localai:
        condition: service_started
    volumes:
      - "worker_1_models:/models"
    environment:
      # --- Identity and registration ---
      LOCALAI_NODE_NAME: "worker-1"
      LOCALAI_REGISTER_TO: "http://localai:8080"
      LOCALAI_REGISTRATION_TOKEN: "changeme"  # Must match frontend token
      LOCALAI_HEARTBEAT_INTERVAL: "10s"
      LOCALAI_NATS_URL: "nats://nats:4222"

      # --- Addresses ---
      # gRPC listens on 50051; the HTTP file transfer server uses the gRPC
      # base port - 1 (50050 here).
      LOCALAI_SERVE_ADDR: "0.0.0.0:50051"
      LOCALAI_ADVERTISE_ADDR: "worker-1:50051"
      LOCALAI_ADVERTISE_HTTP_ADDR: "worker-1:50050"

      # --- Health check ---
      # The image's default HEALTHCHECK probes the server's /readyz on 8080,
      # but workers don't run the OpenAI API server. Their HTTP file transfer
      # server exposes /readyz instead, so point the inherited HEALTHCHECK
      # at that port via this variable.
      HEALTHCHECK_ENDPOINT: "http://localhost:50050/readyz"

      # --- Misc ---
      DEBUG: "true"
      GODEBUG: "netdns=go"  # See note in localai service
      MODELS_PATH: "/models"

    # --- GPU Support (NVIDIA) ---
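    # Uncomment the `init` and `deploy` keys below and change the image to a
    # CUDA variant (e.g., localai/localai:latest-gpu-nvidia-cuda-12) to enable
    # GPU. Do not uncomment `environment:` as a second key: this service
    # already has an `environment:` map, and duplicate mapping keys are
    # invalid YAML. Add NVIDIA_DRIVER_CAPABILITIES to the existing map instead.
    #
    # NVIDIA_DRIVER_CAPABILITIES must include `utility` so nvidia-smi / NVML
    # are available inside the container; without it the worker cannot report
    # free VRAM and the Nodes page will show 0 free / total used.
    # `init: true` avoids zombie-reap races that make nvidia-smi flaky.
    #
    # init: true
    # environment:  # merge this entry into the existing `environment:` map
    #   NVIDIA_DRIVER_CAPABILITIES: "compute,utility"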
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia.com/gpu
    #           count: all
    #           capabilities: [gpu, utility]

  # --- Shared Volume Mode (optional) ---
  # If all services run on the same Docker host, you can skip the
  # frontend-to-worker file transfer by sharing a single models volume.
  # Replace the volumes above with:
  #
  #   localai:
  #     volumes:
  #       - shared_models:/models
  #       - frontend_data:/data
  #
  #   worker-1:
  #     volumes:
  #       - shared_models:/models
  #
  # Then add to the volumes section:
  #   shared_models:
  #
  # With shared volumes the model files are already present on every worker at
  # the same path. Set LOCALAI_DISTRIBUTED_SHARED_MODELS=true on the frontend
  # (see its environment above) so the router skips file staging and the
  # worker loads the model directly from the shared path instead of
  # re-downloading it into a per-model subdirectory.

  # --- Adding More Workers ---
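  # Copy the worker-1 service above and change:
  #   - Service name (e.g., worker-2)
  #   - LOCALAI_NODE_NAME (must be unique)
  #   - LOCALAI_ADVERTISE_ADDR and LOCALAI_ADVERTISE_HTTP_ADDR
  #     (the hostname must match the new service name)
  #   - The models volume (e.g., worker_2_models, which must also be declared
  #     under the top-level `volumes:` key) so each worker keeps its own
  #     independent model store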
  #
  # Workers are generic — no backend type needed. The SmartRouter
  # will dynamically install the required backend via NATS when
  # a model request arrives.

  # --- Agent Worker ---
  # Dedicated process for agent chat execution.
  # Receives chat jobs from NATS, runs cogito LLM calls via the LocalAI API,
  # and publishes results back via NATS for SSE delivery.
  # No database access needed — config and skills are sent in the NATS payload.

  agent-worker-1:
    # image: localai/localai:latest-cpu
    build:
      context: .
      dockerfile: Dockerfile
      args:
        - IMAGE_TYPE=core
        - BASE_IMAGE=ubuntu:24.04
    # Install the Docker CLI (if needed) and start agent-worker.
    # The Docker socket is mounted from the host so that MCP stdio servers
    # using "docker run" commands can spawn containers on the host Docker.
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        # Install the Docker CLI only when it is missing, so restarting an
        # existing container does not re-run apt-get (and need network).
        # Installation stays best-effort: the worker still starts if it
        # fails, but the failure is reported instead of silently swallowed.
        if ! command -v docker >/dev/null 2>&1; then
          apt-get update -qq && apt-get install -y -qq docker.io >/dev/null 2>&1 ||
            echo "WARNING: could not install docker.io; MCP stdio servers that use 'docker run' will not work" >&2
        fi
        exec /entrypoint.sh agent-worker
    # The agent worker is NATS-only — no HTTP server to probe. Disable the
    # image's inherited HEALTHCHECK so the container doesn't show unhealthy.
    healthcheck:
      disable: true
    environment:
      LOCALAI_NATS_URL: "nats://nats:4222"
      LOCALAI_REGISTER_TO: "http://localai:8080"
      LOCALAI_NODE_NAME: "agent-worker-1"
      LOCALAI_REGISTRATION_TOKEN: "changeme"  # Must match frontend token
      GODEBUG: "netdns=go"  # See note in localai service
    volumes:
      # SECURITY: access to the host Docker socket lets this container start
      # arbitrary containers on the host, which is effectively root on the
      # host. Remove this mount if no MCP server needs "docker run".
      - /var/run/docker.sock:/var/run/docker.sock
    depends_on:
      localai:
        condition: service_started
      nats:
        condition: service_started

# Named volumes used by the services above; all use default settings.
volumes:
  # Mounted at /var/lib/postgresql in the postgres service.
  postgres_data: {}
  # Mounted at /models and /data in the localai frontend.
  frontend_models: {}
  frontend_data: {}
  # worker-1's own independent models volume, mounted at /models.
  worker_1_models: {}