mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 09:57:14 -04:00
fix(distributed): broadcast admin model-config changes across replicas (#10540)
In distributed mode the admin model endpoints (/models/edit, /models/import, /models/toggle-state and the PATCH config-json endpoint) wrote the YAML to the shared models dir but reloaded only the local replica's in-memory ModelConfigLoader. With multiple frontend replicas behind one service, a save landed on whichever replica handled the request; peers kept serving their stale in-memory view, so a load-balanced request was a coin-flip between old and new config (a created alias visible on one replica and missing on the other, an edited alias target diverging, etc.). The NATS cache-invalidation channel (SubjectCacheInvalidateModels + OnModelsChanged) already existed for the gallery install/delete path; these admin endpoints simply never published on it. Wire them up via a new GalleryService.BroadcastModelsChanged helper (no-op in standalone mode). Also fix delete propagation: LoadModelConfigsFromPath is additive and never drops an entry whose file is gone, so the subscriber hook (which only reloaded from disk) could not propagate a removal. ApplyRemoteChange now honors the event op - pruning the element on "delete" and reloading otherwise - and shuts down any running instance of the affected model so the new config takes effect. This closes the same latent gap on the gallery delete path. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -404,6 +404,36 @@ var _ = Describe("GalleryService cache invalidation broadcasts", func() {
|
||||
Element: "x", Op: "install",
|
||||
})).To(Succeed())
|
||||
})
|
||||
|
||||
It("BroadcastModelsChanged delivers the element and op to a peer's OnModelsChanged", func() {
|
||||
var (
|
||||
mu sync.Mutex
|
||||
seen []messaging.CacheInvalidateEvent
|
||||
)
|
||||
svcB.OnModelsChanged = func(evt messaging.CacheInvalidateEvent) {
|
||||
mu.Lock()
|
||||
seen = append(seen, evt)
|
||||
mu.Unlock()
|
||||
}
|
||||
Expect(svcA.SubscribeBroadcasts()).To(Succeed())
|
||||
Expect(svcB.SubscribeBroadcasts()).To(Succeed())
|
||||
|
||||
// An admin edit on replica A must reach replica B over the same subject
|
||||
// the gallery path uses, so B refreshes its in-memory config loader.
|
||||
svcA.BroadcastModelsChanged("my-alias", "install")
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
Expect(seen).To(ContainElement(messaging.CacheInvalidateEvent{
|
||||
Element: "my-alias", Op: "install",
|
||||
}))
|
||||
})
|
||||
|
||||
It("BroadcastModelsChanged is a no-op when NATS is not wired (standalone)", func() {
|
||||
standalone := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil)
|
||||
// No SetNATSClient: must not panic and must simply do nothing.
|
||||
Expect(func() { standalone.BroadcastModelsChanged("x", "delete") }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("GalleryService PostgreSQL hydration", func() {
|
||||
|
||||
@@ -201,6 +201,24 @@ func (g *GalleryService) publishCacheInvalidate(subject string, evt messaging.Ca
|
||||
}
|
||||
}
|
||||
|
||||
// BroadcastModelsChanged notifies peer replicas that a model config was
|
||||
// created, edited, or removed out-of-band of the gallery install/delete
|
||||
// channel (e.g. the admin /models/edit, /models/import and
|
||||
// /models/toggle-state endpoints, which write the YAML and reload only the
|
||||
// local in-memory loader). Peers receive it via OnModelsChanged and refresh
|
||||
// their own ModelConfigLoader so a request load-balanced to any replica sees
|
||||
// the same config. No-op in standalone mode (no NATS client).
|
||||
//
|
||||
// op is "install" for a create/edit (the element must be (re)loaded from
|
||||
// disk) or "delete" for a removal (the element must be pruned from memory,
|
||||
// which a reload-from-path cannot do because the loader is additive).
|
||||
func (g *GalleryService) BroadcastModelsChanged(element, op string) {
|
||||
g.publishCacheInvalidate(messaging.SubjectCacheInvalidateModels, messaging.CacheInvalidateEvent{
|
||||
Element: element,
|
||||
Op: op,
|
||||
})
|
||||
}
|
||||
|
||||
// mergeStatus is the broadcast-side merge: it updates the in-memory map from
|
||||
// a peer's GalleryProgressEvent without re-publishing to NATS or re-writing
|
||||
// to PostgreSQL. UpdateStatus is the local-write entry point and does both;
|
||||
|
||||
53
core/services/modeladmin/remote_sync.go
Normal file
53
core/services/modeladmin/remote_sync.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package modeladmin
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/messaging"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// opDelete is the CacheInvalidateEvent.Op value the gallery delete path and the
|
||||
// admin delete endpoint use; a delete must prune (a reload-from-path cannot).
|
||||
const opDelete = "delete"
|
||||
|
||||
// ApplyRemoteChange refreshes this replica's in-memory model state from a peer
|
||||
// replica's model-config change broadcast (messaging.CacheInvalidateEvent on
|
||||
// SubjectCacheInvalidateModels). It is the subscriber-side counterpart to
|
||||
// GalleryService.BroadcastModelsChanged.
|
||||
//
|
||||
// The op matters because LoadModelConfigsFromPath is additive: it loads every
|
||||
// YAML on disk into the loader but never removes an entry whose file is gone.
|
||||
// So a delete cannot be propagated by a plain reload - the deleted element must
|
||||
// be explicitly pruned. Specifically:
|
||||
//
|
||||
// - op == "delete" with a named element: prune that element from the loader.
|
||||
// - otherwise: reload all configs from disk (picks up creates and edits).
|
||||
//
|
||||
// In both cases, when an element is named, any running instance on this replica
|
||||
// is shut down (best-effort) so the next request rebuilds it from the new
|
||||
// config instead of serving the stale one - mirroring what the originating
|
||||
// replica does on a local edit/delete.
|
||||
//
|
||||
// ml may be nil (no running instances to shut down). modelsPath and opts are
|
||||
// forwarded to LoadModelConfigsFromPath.
|
||||
func ApplyRemoteChange(cl *config.ModelConfigLoader, ml *model.ModelLoader, modelsPath string, evt messaging.CacheInvalidateEvent, opts ...config.ConfigLoaderOption) error {
|
||||
if evt.Op == opDelete && evt.Element != "" {
|
||||
cl.RemoveModelConfig(evt.Element)
|
||||
} else if err := cl.LoadModelConfigsFromPath(modelsPath, opts...); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Drop any running instance of the affected model so the next request
|
||||
// rebuilds it from the refreshed config instead of serving the stale one.
|
||||
// Best-effort: the model may not be loaded on this replica, which surfaces
|
||||
// as a benign error here.
|
||||
if ml != nil && evt.Element != "" {
|
||||
if err := ml.ShutdownModel(evt.Element); err != nil {
|
||||
xlog.Debug("ApplyRemoteChange: could not shut down model instance (likely not loaded)",
|
||||
"model", evt.Element, "error", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
80
core/services/modeladmin/remote_sync_test.go
Normal file
80
core/services/modeladmin/remote_sync_test.go
Normal file
@@ -0,0 +1,80 @@
|
||||
package modeladmin
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/messaging"
|
||||
)
|
||||
|
||||
var _ = Describe("ApplyRemoteChange", func() {
|
||||
var (
|
||||
dir string
|
||||
loader *config.ModelConfigLoader
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
dir = GinkgoT().TempDir()
|
||||
loader = config.NewModelConfigLoader(dir)
|
||||
})
|
||||
|
||||
writeYAML := func(name string, body map[string]any) {
|
||||
body["name"] = name
|
||||
data, err := yaml.Marshal(body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(filepath.Join(dir, name+".yaml"), data, 0644)).To(Succeed())
|
||||
}
|
||||
|
||||
It("loads a peer-created config from disk on an install event", func() {
|
||||
// Peer wrote the YAML to the shared models dir; this replica has not
|
||||
// loaded it yet (empty in-memory loader).
|
||||
writeYAML("peer-alias", map[string]any{"alias": "qwen"})
|
||||
_, ok := loader.GetModelConfig("peer-alias")
|
||||
Expect(ok).To(BeFalse(), "precondition: not yet in memory")
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{
|
||||
Element: "peer-alias", Op: "install",
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok = loader.GetModelConfig("peer-alias")
|
||||
Expect(ok).To(BeTrue(), "install event must reload the new config from disk")
|
||||
})
|
||||
|
||||
It("prunes a peer-deleted config that a reload-from-path cannot drop", func() {
|
||||
// Model is present in memory (loaded earlier) but its file is now gone
|
||||
// from the shared dir. LoadModelConfigsFromPath is additive, so only an
|
||||
// explicit prune can remove it - this is the cross-replica delete bug.
|
||||
writeYAML("doomed", map[string]any{"alias": "qwen"})
|
||||
Expect(loader.LoadModelConfigsFromPath(dir)).To(Succeed())
|
||||
_, ok := loader.GetModelConfig("doomed")
|
||||
Expect(ok).To(BeTrue(), "precondition: in memory")
|
||||
Expect(os.Remove(filepath.Join(dir, "doomed.yaml"))).To(Succeed())
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{
|
||||
Element: "doomed", Op: "delete",
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok = loader.GetModelConfig("doomed")
|
||||
Expect(ok).To(BeFalse(), "delete event must prune the element from memory")
|
||||
})
|
||||
|
||||
It("does a full reload when no element is named", func() {
|
||||
writeYAML("m1", map[string]any{"alias": "qwen"})
|
||||
writeYAML("m2", map[string]any{"alias": "qwen"})
|
||||
|
||||
err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, ok1 := loader.GetModelConfig("m1")
|
||||
_, ok2 := loader.GetModelConfig("m2")
|
||||
Expect(ok1).To(BeTrue())
|
||||
Expect(ok2).To(BeTrue())
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user