mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-31 20:21:26 -04:00
Two related fixes in the X-LocalAI-Node middleware wrapper:
1. Replace ml.CheckIsLoaded(modelName).NodeID() with the new
ml.LookupNodeID helper in the lazy resolve closure. CheckIsLoaded
acquires ml.mu and, when the recently-healthy cache window has
expired, runs a gRPC HealthCheck with a 2-minute timeout. Running
that on the response writer right before the first byte hits the
client could stall buffered and streaming responses alike for up to
2 minutes on a stale-healthy model. LookupNodeID is a pure store
read with no I/O and no contention against active inference.
2. Return http.ErrNotSupported (wrapped via fmt.Errorf with %w) from
Hijack when the underlying writer does not implement
http.Hijacker, instead of a string-only errors.New. Matches the
standard library convention so callers using errors.Is - notably
http.NewResponseController.Hijack - detect the condition through
the standard sentinel. Future-proof only: no current routes go
through this branch.
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-7[1m]
132 lines
4.6 KiB
Go
132 lines
4.6 KiB
Go
package middleware
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"net"
|
|
"net/http"
|
|
|
|
"github.com/labstack/echo/v4"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
)
|
|
|
|
// NodeHeaderName is the HTTP response header that carries the ID of the
// distributed-mode worker node that served the inference request. It is only
// emitted when --expose-node-header is enabled. The option is off by default
// because node IDs reveal internal topology and should not be exposed on a
// public endpoint.
const NodeHeaderName = "X-LocalAI-Node"
|
|
|
|
// nodeHeaderWriter wraps an http.ResponseWriter and stamps the X-LocalAI-Node
// header lazily on the first Write / WriteHeader / Flush call. The lazy
// resolve is what makes this work for streaming: the picked node ID is only
// known AFTER ml.Load runs (i.e. on the first SSE chunk), so resolving at
// request entry would attach the previous request's routing decision (or
// nothing on a cold cache).
type nodeHeaderWriter struct {
	// ResponseWriter is the wrapped writer that every call is forwarded to.
	http.ResponseWriter

	// resolve returns the node ID to advertise. An empty string means
	// "unknown" and causes the header to be omitted.
	resolve func() string

	// set records that resolve has already been consulted, so the lookup
	// runs at most once per response.
	set bool
}
|
|
|
|
func (w *nodeHeaderWriter) maybeSet() {
|
|
if w.set {
|
|
return
|
|
}
|
|
w.set = true
|
|
if id := w.resolve(); id != "" {
|
|
w.Header().Set(NodeHeaderName, id)
|
|
}
|
|
}
|
|
|
|
func (w *nodeHeaderWriter) Write(b []byte) (int, error) {
|
|
w.maybeSet()
|
|
return w.ResponseWriter.Write(b)
|
|
}
|
|
|
|
func (w *nodeHeaderWriter) WriteHeader(code int) {
|
|
w.maybeSet()
|
|
w.ResponseWriter.WriteHeader(code)
|
|
}
|
|
|
|
// Flush keeps SSE handlers working: Echo's Response.Flush goes through
|
|
// http.NewResponseController which walks Unwrap() chains and invokes Flush
|
|
// on the first wrapper that implements http.Flusher. By implementing it
|
|
// here we both stamp the header before the underlying writer flushes AND
|
|
// keep the streaming path alive.
|
|
func (w *nodeHeaderWriter) Flush() {
|
|
w.maybeSet()
|
|
if f, ok := w.ResponseWriter.(http.Flusher); ok {
|
|
f.Flush()
|
|
}
|
|
}
|
|
|
|
// Hijack preserves WebSocket / raw-conn handlers that need to take over the
|
|
// underlying TCP connection (e.g. /v1/realtime). Without this the wrapper
|
|
// would silently break those endpoints.
|
|
//
|
|
// When the underlying writer does not implement http.Hijacker we return
|
|
// http.ErrNotSupported so callers using errors.Is (notably
|
|
// http.NewResponseController.Hijack) detect the condition through the
|
|
// standard sentinel rather than a string-matched custom error.
|
|
func (w *nodeHeaderWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
|
|
if h, ok := w.ResponseWriter.(http.Hijacker); ok {
|
|
return h.Hijack()
|
|
}
|
|
return nil, nil, fmt.Errorf("hijack not supported: %w", http.ErrNotSupported)
|
|
}
|
|
|
|
// Unwrap lets http.NewResponseController reach through us to find optional
|
|
// interfaces (CloseNotifier, SetReadDeadline, etc.) on the real writer.
|
|
func (w *nodeHeaderWriter) Unwrap() http.ResponseWriter {
|
|
return w.ResponseWriter
|
|
}
|
|
|
|
// ExposeNodeHeader installs a per-request response writer wrapper that
|
|
// stamps the X-LocalAI-Node header from the currently-loaded model's node
|
|
// ID on the first write. Off by default; opted in via --expose-node-header
|
|
// / LOCALAI_EXPOSE_NODE_HEADER. The model name is read from the standard
|
|
// per-request context key set by the request-extractor middleware chain
|
|
// (CONTEXT_LOCALS_KEY_MODEL_NAME), so any handler that goes through the
|
|
// usual SetModelAndConfig wiring is automatically covered.
|
|
//
|
|
// Best-effort: under heavy concurrency for the same model across multiple
|
|
// replicas, the header may reflect a recent routing decision rather than
|
|
// this exact request's, because the model loader's per-modelID store entry
|
|
// is overwritten on every routing decision. Acceptable for observability
|
|
// and debugging.
|
|
func ExposeNodeHeader(appCfg *config.ApplicationConfig, ml *model.ModelLoader) echo.MiddlewareFunc {
|
|
return func(next echo.HandlerFunc) echo.HandlerFunc {
|
|
return func(c echo.Context) error {
|
|
if appCfg == nil || !appCfg.ExposeNodeHeader || ml == nil {
|
|
return next(c)
|
|
}
|
|
orig := c.Response().Writer
|
|
wrapper := &nodeHeaderWriter{
|
|
ResponseWriter: orig,
|
|
resolve: func() string {
|
|
modelName, _ := c.Get(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
|
|
if modelName == "" {
|
|
return ""
|
|
}
|
|
// Pure store read - never invokes HealthCheck and
|
|
// never acquires ml.mu, so the wrapper cannot stall
|
|
// the response writer for the 2-minute gRPC
|
|
// HealthCheck timeout that CheckIsLoaded can pay
|
|
// when the recently-healthy cache window has
|
|
// expired. The X-LocalAI-Node header is
|
|
// best-effort observability; a stale value is
|
|
// preferable to blocking the byte stream.
|
|
return ml.LookupNodeID(modelName)
|
|
},
|
|
}
|
|
c.Response().Writer = wrapper
|
|
defer func() {
|
|
c.Response().Writer = orig
|
|
}()
|
|
return next(c)
|
|
}
|
|
}
|
|
}
|