Files
LocalAI/pkg/model/model.go
Ettore Di Giacinto b85b7e29df feat(distributed): surface picked node ID via X-LocalAI-Node header
Plumb the SmartRouter's per-request node decision up to the OpenAI
inference handlers (chat, completions, embeddings) and attach it as the
X-LocalAI-Node response header when the operator enabled
--expose-node-header.

Wiring:

- pkg/model.Model gains a NodeID field plus mutex-guarded
  SetNodeID/NodeID accessors. The router stamps it on the *Model it
  returns from NewModelWithClient; the field stays empty for in-process
  loads.
- core/services/nodes/model_router.go calls SetNodeID after constructing
  the Model, so the in-process store carries the most recent routing
  decision per modelID.
- core/http/endpoints/openai/node_header.go centralizes the policy in
  maybeSetNodeHeader (no-op when the flag is off, the model is not
  loaded, or no node ID is recorded). chat, completion and embeddings
  handlers call it before writing the response.

Best-effort caveat: the distributed LoadModel path overwrites the per
modelID store entry on every routing decision, so under heavy
concurrency the header reflects "a recent decision" rather than "the
exact node that served this exact request". This is acceptable for
observability and matches what operators already see in the cluster
logs. Documented in the flag help text and in the distributed-mode
feature doc.

Assisted-by: Claude:claude-opus-4-7[1m]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 20:14:02 +00:00

101 lines
2.7 KiB
Go

package model
import (
"sync"
"time"
grpc "github.com/mudler/LocalAI/pkg/grpc"
process "github.com/mudler/go-processmanager"
)
// healthCheckTTL is how long a successful health check stays valid. While a
// cached result is still fresh, checkIsLoaded calls can skip the gRPC
// round-trip, so concurrent requests are not serialized behind ml.mu.Lock().
const healthCheckTTL time.Duration = 30 * time.Second
// Model is a handle to a backend model: its identifier, the address of its
// gRPC backend, and the client used to talk to it. The embedded mutex guards
// lastHealthCheck and nodeID, and is held by GRPC while it creates the client.
type Model struct {
// ID is the model identifier, serialized as "id" in JSON.
ID string `json:"id"`
// address is the backend address passed to grpc.NewClient when GRPC has to
// create the client itself.
address string
// client is the gRPC backend client: supplied up front by
// NewModelWithClient, otherwise created on first use by GRPC.
client grpc.Backend
// process is the backend process given to NewModel; it is left nil by
// NewModelWithClient.
process *process.Process
// lastHealthCheck is the time recorded by the last MarkHealthy call; the
// zero value means no successful health check has been recorded.
lastHealthCheck time.Time
// nodeID is the ID of the distributed-mode worker node that owns this
// model handle, when set. Empty for in-process models. Best-effort:
// because the distributed LoadModel path overwrites the per-modelID
// store entry on every routing decision, this value reflects the
// most-recently-routed node for the model, not necessarily the node
// that served a specific in-flight request. Used by the optional
// X-LocalAI-Node response header (--expose-node-header).
nodeID string
// Embedded mutex; the methods of Model lock it through m.Lock/m.Unlock.
sync.Mutex
}
// NewModel builds a Model for the given ID and backend address and attaches
// the backend process to it. No gRPC client is created here; the client field
// stays nil until GRPC is called.
func NewModel(ID, address string, process *process.Process) *Model {
	m := new(Model)
	m.ID = ID
	m.address = address
	m.process = process
	return m
}
// NewModelWithClient builds a Model that already carries a gRPC client, so
// GRPC returns that client instead of creating one. It is used in distributed
// mode, where the client is wrapped with file staging. The process field is
// left nil.
func NewModelWithClient(ID, address string, client grpc.Backend) *Model {
	m := new(Model)
	m.ID = ID
	m.address = address
	m.client = client
	return m
}
// SetNodeID stores id as the distributed-mode worker node that owns this
// model handle. The write happens under the model mutex, so it may be called
// from any goroutine.
func (m *Model) SetNodeID(id string) {
	m.Lock()
	m.nodeID = id
	m.Unlock()
}
// NodeID reports the distributed-mode worker node ID recorded for this model
// handle, or "" when none is known (for example, in-process models). The
// value is best-effort: it is whatever SetNodeID stored last, which is not
// necessarily the node that served a particular in-flight request.
func (m *Model) NodeID() string {
	m.Lock()
	id := m.nodeID
	m.Unlock()
	return id
}
// Process returns the backend process that was passed to NewModel. It is nil
// for models created with NewModelWithClient, which never set the field.
func (m *Model) Process() *process.Process {
return m.process
}
// IsRecentlyHealthy reports whether a successful health check was recorded
// less than healthCheckTTL ago. It returns false when no health check has
// ever been recorded.
func (m *Model) IsRecentlyHealthy() bool {
	// Snapshot the timestamp under the lock; the comparison itself needs no
	// shared state.
	m.Lock()
	last := m.lastHealthCheck
	m.Unlock()

	if last.IsZero() {
		return false
	}
	return time.Since(last) < healthCheckTTL
}
// MarkHealthy stamps the model with the current time as its last successful
// health check, which IsRecentlyHealthy then compares against healthCheckTTL.
func (m *Model) MarkHealthy() {
	m.Lock()
	m.lastHealthCheck = time.Now()
	m.Unlock()
}
// GRPC returns the gRPC backend client for this model, creating it on first
// use.
//
// A client supplied through NewModelWithClient, or created by an earlier
// call, is returned unchanged; parallel and wd are only consulted when a new
// client has to be built from the model address. The watchdog is enabled on
// the new client exactly when wd is non-nil.
//
// Both the nil check and the creation happen under the model mutex, so
// concurrent callers share a single client instead of racing on m.client and
// each building their own. Callers must not hold the model lock when calling
// GRPC.
func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
	m.Lock()
	defer m.Unlock()

	if m.client == nil {
		m.client = grpc.NewClient(m.address, parallel, wd, wd != nil)
	}
	return m.client
}