Files
LocalAI/core/http/middleware/node_header.go
Ettore Di Giacinto 8b2697f39a fix(distributed): drop hot-path I/O from node-header wrapper
Two related fixes in the X-LocalAI-Node middleware wrapper:

  1. Replace ml.CheckIsLoaded(modelName).NodeID() with the new
     ml.LookupNodeID helper in the lazy resolve closure. CheckIsLoaded
     acquires ml.mu and, when the recently-healthy cache window has
     expired, runs a gRPC HealthCheck with a 2-minute timeout. Running
     that on the response writer right before the first byte hits the
     client could stall buffered and streaming responses alike for up to
     2 minutes on a stale-healthy model. LookupNodeID is a pure store
     read with no I/O and no contention against active inference.

  2. Return http.ErrNotSupported (wrapped via fmt.Errorf with %w) from
     Hijack when the underlying writer does not implement
     http.Hijacker, instead of a string-only errors.New. Matches the
     standard library convention so callers using errors.Is - notably
     http.NewResponseController.Hijack - detect the condition through
     the standard sentinel. Future-proof only: no current routes go
     through this branch.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-7[1m]
2026-05-24 21:40:57 +00:00

132 lines
4.6 KiB
Go

package middleware
import (
"bufio"
"fmt"
"net"
"net/http"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/model"
)
// NodeHeaderName is the name of the HTTP response header that carries the ID
// of the distributed-mode worker node that served the inference request.
//
// The header is only emitted when --expose-node-header is enabled. It is off
// by default because node IDs reveal internal topology and should not be
// exposed on a public endpoint.
const NodeHeaderName = "X-LocalAI-Node"
// nodeHeaderWriter decorates an http.ResponseWriter so that the
// X-LocalAI-Node header is stamped lazily, on the first Write, WriteHeader
// or Flush call, instead of at request entry.
//
// Deferring the lookup is what makes this work for streaming: the picked
// node ID is only known AFTER ml.Load runs (i.e. by the first SSE chunk), so
// resolving at request entry would attach the previous request's routing
// decision, or nothing at all on a cold cache.
type nodeHeaderWriter struct {
	// ResponseWriter is the wrapped writer that every call is forwarded to.
	http.ResponseWriter

	// resolve returns the node ID to stamp; an empty string leaves the
	// header unset.
	resolve func() string

	// set records that the one-shot stamping step has already run.
	set bool
}
// maybeSet performs the one-shot header stamping. The first call marks the
// writer as handled, asks resolve for the node ID and, when that ID is
// non-empty, stores it under NodeHeaderName. Later calls are no-ops, so
// resolve runs at most once per wrapper.
func (w *nodeHeaderWriter) maybeSet() {
	if !w.set {
		// Flip the flag before resolving so the lookup is never repeated,
		// even when it yields no ID.
		w.set = true

		nodeID := w.resolve()
		if nodeID == "" {
			return
		}
		w.Header().Set(NodeHeaderName, nodeID)
	}
}
// Write stamps the node header if that has not happened yet, then forwards
// the payload to the wrapped writer and returns its result unchanged.
func (w *nodeHeaderWriter) Write(b []byte) (int, error) {
	// Stamp first so the header is in place before the wrapped writer sees
	// the body bytes.
	w.maybeSet()

	written, err := w.ResponseWriter.Write(b)
	return written, err
}
// WriteHeader stamps the node header if that has not happened yet, then
// forwards the status code to the wrapped writer.
func (w *nodeHeaderWriter) WriteHeader(code int) {
	// The stamp has to land before the status is handed on: changing the
	// header map after WriteHeader has no effect on the response.
	w.maybeSet()

	inner := w.ResponseWriter
	inner.WriteHeader(code)
}
// Flush keeps SSE handlers working: Echo's Response.Flush goes through
// http.NewResponseController, which walks Unwrap() chains and invokes Flush
// on the first wrapper that implements it. By implementing it here we both
// stamp the header before the underlying writer flushes AND keep the
// streaming path alive.
//
// Because this method is the one the controller finds, the controller stops
// at this wrapper and does not look any deeper. The flush is therefore
// forwarded through a response controller rooted at the wrapped writer, so
// it still reaches a flush-capable writer that sits behind another
// Unwrap-only wrapper instead of being dropped.
func (w *nodeHeaderWriter) Flush() {
	w.maybeSet()

	// http.Flusher has no error return, so flushing stays best-effort: a
	// chain with no flush-capable writer is silently a no-op.
	_ = http.NewResponseController(w.ResponseWriter).Flush()
}
// Hijack preserves WebSocket / raw-conn handlers that need to take over the
// underlying TCP connection (e.g. /v1/realtime). Without this the wrapper
// would silently break those endpoints.
//
// The wrapped writer is searched through its Unwrap() chain, and the first
// writer that implements http.Hijacker handles the call; its result,
// including any error, is returned unchanged. The walk is needed because
// http.NewResponseController stops at this method and does not look behind
// it, so a hijacker sitting behind another Unwrap-only wrapper would
// otherwise be reported as unsupported.
//
// When no writer in the chain implements http.Hijacker we return
// http.ErrNotSupported so callers using errors.Is (notably
// http.NewResponseController.Hijack) detect the condition through the
// standard sentinel rather than a string-matched custom error.
func (w *nodeHeaderWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
	rw := w.ResponseWriter
	for rw != nil {
		if h, ok := rw.(http.Hijacker); ok {
			return h.Hijack()
		}
		u, ok := rw.(interface{ Unwrap() http.ResponseWriter })
		if !ok {
			break
		}
		rw = u.Unwrap()
	}
	return nil, nil, fmt.Errorf("hijack not supported: %w", http.ErrNotSupported)
}
// Unwrap returns the wrapped writer so that http.NewResponseController can
// reach through this wrapper for the optional operations it does not
// implement itself (SetReadDeadline, SetWriteDeadline, EnableFullDuplex) and
// resolve them on the real writer.
func (w *nodeHeaderWriter) Unwrap() http.ResponseWriter {
return w.ResponseWriter
}
// ExposeNodeHeader returns an Echo middleware that swaps the response writer
// for a nodeHeaderWriter for the duration of each request, so that the
// X-LocalAI-Node header is stamped from the currently-loaded model's node ID
// on the first write. Off by default; opted in via --expose-node-header /
// LOCALAI_EXPOSE_NODE_HEADER.
//
// The middleware is a pass-through when appCfg is nil, when
// appCfg.ExposeNodeHeader is false, or when ml is nil. The check runs on
// every request rather than once at construction time.
//
// The model name is read from the standard per-request context key set by
// the request-extractor middleware chain (CONTEXT_LOCALS_KEY_MODEL_NAME), so
// any handler that goes through the usual SetModelAndConfig wiring is
// automatically covered. A missing or empty model name leaves the header
// unset.
//
// Best-effort: under heavy concurrency for the same model across multiple
// replicas, the header may reflect a recent routing decision rather than
// this exact request's, because the model loader's per-modelID store entry
// is overwritten on every routing decision. Acceptable for observability
// and debugging.
func ExposeNodeHeader(appCfg *config.ApplicationConfig, ml *model.ModelLoader) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			enabled := appCfg != nil && appCfg.ExposeNodeHeader && ml != nil
			if !enabled {
				return next(c)
			}

			// Evaluated lazily by the wrapper, i.e. only once the handler
			// produces its first output.
			lookupNodeID := func() string {
				name, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
				if !ok || name == "" {
					return ""
				}
				// Pure store read - never invokes HealthCheck and never
				// acquires ml.mu, so the wrapper cannot stall the response
				// writer for the 2-minute gRPC HealthCheck timeout that
				// CheckIsLoaded can pay when the recently-healthy cache
				// window has expired. The X-LocalAI-Node header is
				// best-effort observability; a stale value is preferable
				// to blocking the byte stream.
				return ml.LookupNodeID(name)
			}

			orig := c.Response().Writer
			// Put the original writer back once the handler chain returns,
			// whatever the outcome.
			defer func() {
				c.Response().Writer = orig
			}()
			c.Response().Writer = &nodeHeaderWriter{
				ResponseWriter: orig,
				resolve:        lookupNodeID,
			}

			return next(c)
		}
	}
}