// LocalAI/core/http/middleware/node_header.go

package middleware

import (
	"bufio"
	"fmt"
	"net"
	"net/http"

	"github.com/labstack/echo/v4"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/model"
)

// NodeHeaderName is the HTTP response header that carries the ID of the
// distributed-mode worker node that served the inference request. It is only
// emitted when --expose-node-header is enabled and a non-empty node ID could
// be resolved for the request. Off by default: node IDs reveal internal
// topology and should not be exposed on a public endpoint.
const NodeHeaderName = "X-LocalAI-Node"

// nodeHeaderWriter wraps an http.ResponseWriter and stamps the X-LocalAI-Node
// header lazily on the first Write / WriteHeader / Flush call. The lazy
// resolve is what makes this work for streaming: the picked node ID is only
// known AFTER ml.Load runs (i.e. on the first SSE chunk), so resolving at
// request entry would attach the previous request's routing decision (or
// nothing on a cold cache).
//
// The type carries no synchronization of its own; it is created per request
// and used like any other http.ResponseWriter.
type nodeHeaderWriter struct {
	// Embedded wrapped writer: Header, and every forwarded call, goes to it.
	http.ResponseWriter
	resolve func() string // returns the node ID to stamp; "" leaves the header unset
	set     bool          // true once resolve has been consulted, so it runs at most once
}

// maybeSet resolves the node ID and stamps it into the response headers the
// first time it is called; every later call is a no-op. An empty ID leaves
// the header absent, and the resolver is still not consulted again.
func (w *nodeHeaderWriter) maybeSet() {
	if w.set {
		return
	}
	// Flip the flag before resolving so the resolver runs at most once per
	// response, whatever it returns.
	w.set = true

	nodeID := w.resolve()
	if nodeID == "" {
		return
	}
	w.Header().Set(NodeHeaderName, nodeID)
}

// Write stamps the node header if that has not happened yet, then forwards b
// to the wrapped writer and returns its result unchanged. Stamping first
// matters because the first body write commits the response headers.
func (w *nodeHeaderWriter) Write(b []byte) (int, error) {
	w.maybeSet()
	n, err := w.ResponseWriter.Write(b)
	return n, err
}

// WriteHeader stamps the node header if that has not happened yet, then
// forwards the status code to the wrapped writer. The stamp has to come
// first: once the wrapped writer has sent the header block, later header
// changes no longer reach the client.
func (w *nodeHeaderWriter) WriteHeader(code int) {
	w.maybeSet()
	inner := w.ResponseWriter
	inner.WriteHeader(code)
}

// Flush stamps the node header if that has not happened yet and then flushes
// the wrapped writer, which keeps SSE handlers streaming. Echo's
// Response.Flush goes through http.NewResponseController, which walks
// Unwrap() chains and stops at the first writer that implements
// http.Flusher - i.e. at this wrapper - so the flush has to be forwarded
// from here.
//
// The forward itself also goes through http.NewResponseController rather
// than a direct type assertion, so that a wrapped writer which exposes its
// flusher only via Unwrap() is still flushed instead of being silently
// skipped (which would stall the stream).
func (w *nodeHeaderWriter) Flush() {
	w.maybeSet()
	// http.Flusher's Flush has no error return, so there is nowhere to
	// report a failure; a writer chain without flush support stays a
	// silent no-op.
	_ = http.NewResponseController(w.ResponseWriter).Flush()
}

// Hijack preserves WebSocket / raw-conn handlers that need to take over the
// underlying TCP connection (e.g. /v1/realtime). Without this the wrapper
// would silently break those endpoints.
//
// The wrapped writer is searched the same way http.NewResponseController
// searches: if it implements http.Hijacker the call is forwarded, otherwise
// its Unwrap() chain is followed until a hijacker is found. Checking only
// the immediately wrapped writer would report "not supported" for a writer
// that exposes its hijacker solely through Unwrap().
//
// When no writer in the chain implements http.Hijacker we return an error
// wrapping http.ErrNotSupported so callers using errors.Is (notably
// http.NewResponseController.Hijack) detect the condition through the
// standard sentinel rather than a string-matched custom error. Errors from
// a real hijacker are passed through untouched.
//
// The node header is deliberately not stamped here: a hijacked connection
// bypasses the regular HTTP response.
func (w *nodeHeaderWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
	rw := w.ResponseWriter
	for rw != nil {
		switch t := rw.(type) {
		case http.Hijacker:
			return t.Hijack()
		case interface{ Unwrap() http.ResponseWriter }:
			rw = t.Unwrap()
		default:
			// End of the chain without a hijacker.
			rw = nil
		}
	}
	return nil, nil, fmt.Errorf("hijack not supported: %w", http.ErrNotSupported)
}

// Unwrap returns the wrapped writer. http.NewResponseController uses it to
// reach through this wrapper for the optional operations the wrapper does not
// implement itself (SetReadDeadline, SetWriteDeadline, EnableFullDuplex) and
// find them on the real writer.
func (w *nodeHeaderWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}

// ExposeNodeHeader returns Echo middleware that, for the duration of each
// request, swaps the response writer for a nodeHeaderWriter so that the
// X-LocalAI-Node header is stamped on the first write with the node ID
// recorded for the request's model. The original writer is put back when
// the downstream handler returns.
//
// The feature is off by default and opted into via --expose-node-header /
// LOCALAI_EXPOSE_NODE_HEADER. When appCfg is nil, the option is disabled, or
// ml is nil, the middleware is a pass-through. The enabled check runs on
// every request rather than once at construction time.
//
// The model name is read from the standard per-request context key set by
// the request-extractor middleware chain (CONTEXT_LOCALS_KEY_MODEL_NAME), so
// any handler that goes through the usual SetModelAndConfig wiring is
// covered automatically. A missing or empty model name leaves the header
// unset.
//
// Best-effort: under heavy concurrency for the same model across multiple
// replicas, the header may reflect a recent routing decision rather than
// this exact request's, because the model loader's per-modelID store entry
// is overwritten on every routing decision. Acceptable for observability
// and debugging.
func ExposeNodeHeader(appCfg *config.ApplicationConfig, ml *model.ModelLoader) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			enabled := appCfg != nil && appCfg.ExposeNodeHeader && ml != nil
			if !enabled {
				return next(c)
			}

			// Evaluated lazily by the wrapper on the first write, not here:
			// the model name and routing decision may not exist yet at
			// request entry.
			lookupNodeID := func() string {
				name, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
				if !ok || name == "" {
					return ""
				}
				// LookupNodeID is a pure store read - it never invokes
				// HealthCheck and never acquires ml.mu - so the wrapper
				// cannot stall the response writer for the 2-minute gRPC
				// HealthCheck timeout that CheckIsLoaded can pay when the
				// recently-healthy cache window has expired. The header is
				// best-effort observability; a stale value is preferable
				// to blocking the byte stream.
				return ml.LookupNodeID(name)
			}

			previous := c.Response().Writer
			c.Response().Writer = &nodeHeaderWriter{
				ResponseWriter: previous,
				resolve:        lookupNodeID,
			}
			// Restore the original writer once the handler chain is done so
			// the wrapper never outlives this middleware's scope.
			defer func() {
				c.Response().Writer = previous
			}()

			return next(c)
		}
	}
}