mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 18:06:58 -04:00
fix(distributed): broadcast admin model-config changes across replicas (#10540)
In distributed mode the admin model endpoints (/models/edit, /models/import, /models/toggle-state and the PATCH config-json endpoint) wrote the YAML to the shared models dir but reloaded only the local replica's in-memory ModelConfigLoader. With multiple frontend replicas behind one service, a save landed on whichever replica handled the request; peers kept serving their stale in-memory view, so a load-balanced request was a coin-flip between old and new config (a created alias visible on one replica and missing on the other, an edited alias target diverging, etc.). The NATS cache-invalidation channel (SubjectCacheInvalidateModels + OnModelsChanged) already existed for the gallery install/delete path; these admin endpoints simply never published on it. Wire them up via a new GalleryService.BroadcastModelsChanged helper (no-op in standalone mode). Also fix delete propagation: LoadModelConfigsFromPath is additive and never drops an entry whose file is gone, so the subscriber hook (which only reloaded from disk) could not propagate a removal. ApplyRemoteChange now honors the event op - pruning the element on "delete" and reloading otherwise - and shuts down any running instance of the affected model so the new config takes effect. This closes the same latent gap on the gallery delete path. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
53
core/services/modeladmin/remote_sync.go
Normal file
53
core/services/modeladmin/remote_sync.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package modeladmin
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/messaging"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// opDelete is the CacheInvalidateEvent.Op value the gallery delete path and the
|
||||
// admin delete endpoint use; a delete must prune (a reload-from-path cannot).
|
||||
const opDelete = "delete"
|
||||
|
||||
// ApplyRemoteChange refreshes this replica's in-memory model state from a peer
|
||||
// replica's model-config change broadcast (messaging.CacheInvalidateEvent on
|
||||
// SubjectCacheInvalidateModels). It is the subscriber-side counterpart to
|
||||
// GalleryService.BroadcastModelsChanged.
|
||||
//
|
||||
// The op matters because LoadModelConfigsFromPath is additive: it loads every
|
||||
// YAML on disk into the loader but never removes an entry whose file is gone.
|
||||
// So a delete cannot be propagated by a plain reload - the deleted element must
|
||||
// be explicitly pruned. Specifically:
|
||||
//
|
||||
// - op == "delete" with a named element: prune that element from the loader.
|
||||
// - otherwise: reload all configs from disk (picks up creates and edits).
|
||||
//
|
||||
// In both cases, when an element is named, any running instance on this replica
|
||||
// is shut down (best-effort) so the next request rebuilds it from the new
|
||||
// config instead of serving the stale one - mirroring what the originating
|
||||
// replica does on a local edit/delete.
|
||||
//
|
||||
// ml may be nil (no running instances to shut down). modelsPath and opts are
|
||||
// forwarded to LoadModelConfigsFromPath.
|
||||
func ApplyRemoteChange(cl *config.ModelConfigLoader, ml *model.ModelLoader, modelsPath string, evt messaging.CacheInvalidateEvent, opts ...config.ConfigLoaderOption) error {
|
||||
if evt.Op == opDelete && evt.Element != "" {
|
||||
cl.RemoveModelConfig(evt.Element)
|
||||
} else if err := cl.LoadModelConfigsFromPath(modelsPath, opts...); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Drop any running instance of the affected model so the next request
|
||||
// rebuilds it from the refreshed config instead of serving the stale one.
|
||||
// Best-effort: the model may not be loaded on this replica, which surfaces
|
||||
// as a benign error here.
|
||||
if ml != nil && evt.Element != "" {
|
||||
if err := ml.ShutdownModel(evt.Element); err != nil {
|
||||
xlog.Debug("ApplyRemoteChange: could not shut down model instance (likely not loaded)",
|
||||
"model", evt.Element, "error", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
80
core/services/modeladmin/remote_sync_test.go
Normal file
80
core/services/modeladmin/remote_sync_test.go
Normal file
@@ -0,0 +1,80 @@
|
||||
package modeladmin
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/messaging"
|
||||
)
|
||||
|
||||
var _ = Describe("ApplyRemoteChange", func() {
|
||||
var (
|
||||
dir string
|
||||
loader *config.ModelConfigLoader
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
dir = GinkgoT().TempDir()
|
||||
loader = config.NewModelConfigLoader(dir)
|
||||
})
|
||||
|
||||
writeYAML := func(name string, body map[string]any) {
|
||||
body["name"] = name
|
||||
data, err := yaml.Marshal(body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(dir, name+".yaml"), data, 0644)).To(Succeed())
|
||||
}
|
||||
|
||||
It("loads a peer-created config from disk on an install event", func() {
|
||||
// Peer wrote the YAML to the shared models dir; this replica has not
|
||||
// loaded it yet (empty in-memory loader).
|
||||
writeYAML("peer-alias", map[string]any{"alias": "qwen"})
|
||||
_, ok := loader.GetModelConfig("peer-alias")
|
||||
Expect(ok).To(BeFalse(), "precondition: not yet in memory")
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{
|
||||
Element: "peer-alias", Op: "install",
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok = loader.GetModelConfig("peer-alias")
|
||||
Expect(ok).To(BeTrue(), "install event must reload the new config from disk")
|
||||
})
|
||||
|
||||
It("prunes a peer-deleted config that a reload-from-path cannot drop", func() {
|
||||
// Model is present in memory (loaded earlier) but its file is now gone
|
||||
// from the shared dir. LoadModelConfigsFromPath is additive, so only an
|
||||
// explicit prune can remove it - this is the cross-replica delete bug.
|
||||
writeYAML("doomed", map[string]any{"alias": "qwen"})
|
||||
Expect(loader.LoadModelConfigsFromPath(dir)).To(Succeed())
|
||||
_, ok := loader.GetModelConfig("doomed")
|
||||
Expect(ok).To(BeTrue(), "precondition: in memory")
|
||||
Expect(os.Remove(filepath.Join(dir, "doomed.yaml"))).To(Succeed())
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{
|
||||
Element: "doomed", Op: "delete",
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok = loader.GetModelConfig("doomed")
|
||||
Expect(ok).To(BeFalse(), "delete event must prune the element from memory")
|
||||
})
|
||||
|
||||
It("does a full reload when no element is named", func() {
|
||||
writeYAML("m1", map[string]any{"alias": "qwen"})
|
||||
writeYAML("m2", map[string]any{"alias": "qwen"})
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok1 := loader.GetModelConfig("m1")
|
||||
_, ok2 := loader.GetModelConfig("m2")
|
||||
Expect(ok1).To(BeTrue())
|
||||
Expect(ok2).To(BeTrue())
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user