mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-31 04:00:05 -04:00
Introduce a per-request Echo middleware that wraps the response writer and lazily stamps X-LocalAI-Node on the first Write / WriteHeader / Flush. This replaces the chan-based per-request rendezvous and per-handler maybeSetNodeHeader calls with a single enforcement point. The wrapper reads the picked node ID by looking up the request's model in the ModelLoader at flush time (late binding), so the value reflects the post-ml.Load state of the loader rather than any pre-route guess. Off by default; gated by ApplicationConfig.ExposeNodeHeader. Ginkgo specs cover off/on, missing model, in-process model (no node ID), absent stash, buffered + streaming flush ordering, error path, and late binding under in-handler stamp. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-7[1m]
123 lines
4.0 KiB
Go
123 lines
4.0 KiB
Go
package middleware
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"net"
|
|
"net/http"
|
|
|
|
"github.com/labstack/echo/v4"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
)
|
|
|
|
// NodeHeaderName names the HTTP response header that carries the ID of the
// distributed-mode worker node that served an inference request. It is only
// emitted when --expose-node-header is enabled; the feature is off by default
// because node IDs reveal internal topology and should not be exposed on a
// public endpoint.
const NodeHeaderName = "X-LocalAI-Node"
|
|
|
|
// nodeHeaderWriter decorates an http.ResponseWriter so that the
// X-LocalAI-Node header is stamped lazily, on the first Write, WriteHeader
// or Flush call. Deferring the lookup is what makes streaming work: the
// picked node ID is only known AFTER ml.Load runs (i.e. on the first SSE
// chunk), so resolving at request entry would attach the previous request's
// routing decision (or nothing on a cold cache).
type nodeHeaderWriter struct {
	// ResponseWriter is the wrapped writer; embedding it promotes Header
	// and lets the methods on this type delegate to it.
	http.ResponseWriter

	// resolve reports the node ID to advertise. An empty string means
	// "do not set the header".
	resolve func() string

	// set latches to true once the stamping attempt has been made, so
	// resolve runs at most once per response.
	set bool
}
|
|
|
|
func (w *nodeHeaderWriter) maybeSet() {
|
|
if w.set {
|
|
return
|
|
}
|
|
w.set = true
|
|
if id := w.resolve(); id != "" {
|
|
w.Header().Set(NodeHeaderName, id)
|
|
}
|
|
}
|
|
|
|
func (w *nodeHeaderWriter) Write(b []byte) (int, error) {
|
|
w.maybeSet()
|
|
return w.ResponseWriter.Write(b)
|
|
}
|
|
|
|
func (w *nodeHeaderWriter) WriteHeader(code int) {
|
|
w.maybeSet()
|
|
w.ResponseWriter.WriteHeader(code)
|
|
}
|
|
|
|
// Flush keeps SSE handlers working: Echo's Response.Flush goes through
|
|
// http.NewResponseController which walks Unwrap() chains and invokes Flush
|
|
// on the first wrapper that implements http.Flusher. By implementing it
|
|
// here we both stamp the header before the underlying writer flushes AND
|
|
// keep the streaming path alive.
|
|
func (w *nodeHeaderWriter) Flush() {
|
|
w.maybeSet()
|
|
if f, ok := w.ResponseWriter.(http.Flusher); ok {
|
|
f.Flush()
|
|
}
|
|
}
|
|
|
|
// Hijack preserves WebSocket / raw-conn handlers that need to take over the
|
|
// underlying TCP connection (e.g. /v1/realtime). Without this the wrapper
|
|
// would silently break those endpoints.
|
|
func (w *nodeHeaderWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
|
|
if h, ok := w.ResponseWriter.(http.Hijacker); ok {
|
|
return h.Hijack()
|
|
}
|
|
return nil, nil, errors.New("ResponseWriter does not implement http.Hijacker")
|
|
}
|
|
|
|
// Unwrap lets http.NewResponseController reach through us to find optional
|
|
// interfaces (CloseNotifier, SetReadDeadline, etc.) on the real writer.
|
|
func (w *nodeHeaderWriter) Unwrap() http.ResponseWriter {
|
|
return w.ResponseWriter
|
|
}
|
|
|
|
// ExposeNodeHeader installs a per-request response writer wrapper that
|
|
// stamps the X-LocalAI-Node header from the currently-loaded model's node
|
|
// ID on the first write. Off by default; opted in via --expose-node-header
|
|
// / LOCALAI_EXPOSE_NODE_HEADER. The model name is read from the standard
|
|
// per-request context key set by the request-extractor middleware chain
|
|
// (CONTEXT_LOCALS_KEY_MODEL_NAME), so any handler that goes through the
|
|
// usual SetModelAndConfig wiring is automatically covered.
|
|
//
|
|
// Best-effort: under heavy concurrency for the same model across multiple
|
|
// replicas, the header may reflect a recent routing decision rather than
|
|
// this exact request's, because the model loader's per-modelID store entry
|
|
// is overwritten on every routing decision. Acceptable for observability
|
|
// and debugging.
|
|
func ExposeNodeHeader(appCfg *config.ApplicationConfig, ml *model.ModelLoader) echo.MiddlewareFunc {
|
|
return func(next echo.HandlerFunc) echo.HandlerFunc {
|
|
return func(c echo.Context) error {
|
|
if appCfg == nil || !appCfg.ExposeNodeHeader || ml == nil {
|
|
return next(c)
|
|
}
|
|
orig := c.Response().Writer
|
|
wrapper := &nodeHeaderWriter{
|
|
ResponseWriter: orig,
|
|
resolve: func() string {
|
|
modelName, _ := c.Get(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
|
|
if modelName == "" {
|
|
return ""
|
|
}
|
|
m := ml.CheckIsLoaded(modelName)
|
|
if m == nil {
|
|
return ""
|
|
}
|
|
return m.NodeID()
|
|
},
|
|
}
|
|
c.Response().Writer = wrapper
|
|
defer func() {
|
|
c.Response().Writer = orig
|
|
}()
|
|
return next(c)
|
|
}
|
|
}
|
|
}
|