mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 19:47:47 -04:00
Plumb the SmartRouter's per-request node decision up to the OpenAI inference handlers (chat, completions, embeddings) and attach it as the X-LocalAI-Node response header when the operator has enabled --expose-node-header. Wiring: - pkg/model.Model gains a nodeID field plus mutex-guarded SetNodeID/NodeID accessors. The router stamps it on the *Model it returns from NewModelWithClient; the field stays empty for in-process loads. - core/services/nodes/model_router.go calls SetNodeID after constructing the Model so the in-process store carries the most-recent routing decision per modelID. - core/http/endpoints/openai/node_header.go centralizes the policy in maybeSetNodeHeader (no-op when the flag is off, the model is not loaded, or no node ID is recorded). The chat, completion and embeddings handlers call it before writing the response. Best-effort caveat: the distributed LoadModel path overwrites the per-modelID store entry on every routing decision, so under heavy concurrency the header reflects "a recent decision" rather than "the exact node that served this exact request". This is acceptable for observability and matches what operators already see in the cluster logs. Documented in the flag help text and in the distributed-mode feature doc. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
101 lines
2.7 KiB
Go
101 lines
2.7 KiB
Go
package model
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
|
process "github.com/mudler/go-processmanager"
|
|
)
|
|
|
|
// healthCheckTTL bounds how long a successful health check stays valid.
// While a cached result is younger than this, checkIsLoaded calls can skip
// the gRPC round-trip, so concurrent requests are not serialized behind
// ml.mu.Lock().
const healthCheckTTL time.Duration = 30 * time.Second
|
|
|
|
type Model struct {
|
|
ID string `json:"id"`
|
|
address string
|
|
client grpc.Backend
|
|
process *process.Process
|
|
lastHealthCheck time.Time
|
|
// nodeID is the ID of the distributed-mode worker node that owns this
|
|
// model handle, when set. Empty for in-process models. Best-effort:
|
|
// because the distributed LoadModel path overwrites the per-modelID
|
|
// store entry on every routing decision, this value reflects the
|
|
// most-recently-routed node for the model, not necessarily the node
|
|
// that served a specific in-flight request. Used by the optional
|
|
// X-LocalAI-Node response header (--expose-node-header).
|
|
nodeID string
|
|
sync.Mutex
|
|
}
|
|
|
|
func NewModel(ID, address string, process *process.Process) *Model {
|
|
return &Model{
|
|
ID: ID,
|
|
address: address,
|
|
process: process,
|
|
}
|
|
}
|
|
|
|
// NewModelWithClient creates a Model with a pre-configured gRPC client.
|
|
// Used in distributed mode where the client is wrapped with file staging.
|
|
func NewModelWithClient(ID, address string, client grpc.Backend) *Model {
|
|
return &Model{
|
|
ID: ID,
|
|
address: address,
|
|
client: client,
|
|
}
|
|
}
|
|
|
|
// SetNodeID records the distributed-mode worker node that owns this model
|
|
// handle. Safe to call from any goroutine.
|
|
func (m *Model) SetNodeID(id string) {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
m.nodeID = id
|
|
}
|
|
|
|
// NodeID returns the distributed-mode worker node ID associated with this
|
|
// model handle, or "" if unknown / in-process. See the nodeID field comment
|
|
// for the best-effort caveat.
|
|
func (m *Model) NodeID() string {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
return m.nodeID
|
|
}
|
|
|
|
func (m *Model) Process() *process.Process {
|
|
return m.process
|
|
}
|
|
|
|
// IsRecentlyHealthy returns true if the model passed a health check within the TTL.
|
|
func (m *Model) IsRecentlyHealthy() bool {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
return !m.lastHealthCheck.IsZero() && time.Since(m.lastHealthCheck) < healthCheckTTL
|
|
}
|
|
|
|
// MarkHealthy records the current time as the last successful health check.
|
|
func (m *Model) MarkHealthy() {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
m.lastHealthCheck = time.Now()
|
|
}
|
|
|
|
func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
|
|
if m.client != nil {
|
|
return m.client
|
|
}
|
|
|
|
enableWD := false
|
|
if wd != nil {
|
|
enableWD = true
|
|
}
|
|
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
m.client = grpc.NewClient(m.address, parallel, wd, enableWD)
|
|
return m.client
|
|
}
|