mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-01 13:42:20 -04:00
* feat: add distributed mode (experimental) Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix data races, mutexes, transactions Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactorings Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fixups Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix events and tool stream in agent chat Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * use ginkgo Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(cron): compute correctly time boundaries avoiding re-triggering Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * enhancements, refactorings Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * do not flood of healthy checks Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * do not list obvious backends as text backends Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * tests fixups Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * refactoring and consolidation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Drop redundant healthcheck Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * enhancements, refactorings Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
313 lines
11 KiB
Go
313 lines
11 KiB
Go
package distributed_test
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"os"
|
|
"path/filepath"
|
|
"sync/atomic"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/services/galleryop"
|
|
"github.com/mudler/LocalAI/core/services/messaging"
|
|
"github.com/mudler/LocalAI/core/services/nodes"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/system"
|
|
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
|
|
pgdriver "gorm.io/driver/postgres"
|
|
"gorm.io/gorm"
|
|
"gorm.io/gorm/logger"
|
|
)
|
|
|
|
var _ = Describe("Model and Backend Managers", Label("Distributed"), func() {
|
|
var (
|
|
infra *TestInfra
|
|
db *gorm.DB
|
|
registry *nodes.NodeRegistry
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
infra = SetupInfra("localai_managers_test")
|
|
|
|
var err error
|
|
db, err = gorm.Open(pgdriver.Open(infra.PGURL), &gorm.Config{
|
|
Logger: logger.Default.LogMode(logger.Silent),
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
registry, err = nodes.NewNodeRegistry(db)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
})
|
|
|
|
Context("LocalModelManager", func() {
|
|
var (
|
|
tempDir string
|
|
ss *system.SystemState
|
|
ml *model.ModelLoader
|
|
localMgr *galleryop.LocalModelManager
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
var err error
|
|
tempDir, err = os.MkdirTemp("", "manager-model-test-*")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
ss, err = system.GetSystemState(system.WithModelPath(tempDir))
|
|
Expect(err).ToNot(HaveOccurred())
|
|
ml = model.NewModelLoader(ss)
|
|
|
|
appCfg := config.NewApplicationConfig()
|
|
appCfg.SystemState = ss
|
|
localMgr = galleryop.NewLocalModelManager(appCfg, ml)
|
|
})
|
|
|
|
AfterEach(func() {
|
|
os.RemoveAll(tempDir)
|
|
})
|
|
|
|
It("should delete a model from the local filesystem", func() {
|
|
// Create a fake model config file
|
|
modelName := "test-model"
|
|
configFile := filepath.Join(tempDir, modelName+".yaml")
|
|
Expect(os.WriteFile(configFile, []byte("name: test-model\n"), 0644)).To(Succeed())
|
|
|
|
err := localMgr.DeleteModel(modelName)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// Assert file is gone
|
|
_, err = os.Stat(configFile)
|
|
Expect(os.IsNotExist(err)).To(BeTrue())
|
|
})
|
|
})
|
|
|
|
Context("LocalBackendManager", func() {
|
|
var (
|
|
tempDir string
|
|
ss *system.SystemState
|
|
ml *model.ModelLoader
|
|
localMgr *galleryop.LocalBackendManager
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
var err error
|
|
tempDir, err = os.MkdirTemp("", "manager-backend-test-*")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
ss, err = system.GetSystemState(system.WithBackendPath(tempDir))
|
|
Expect(err).ToNot(HaveOccurred())
|
|
ml = model.NewModelLoader(ss)
|
|
|
|
appCfg := config.NewApplicationConfig()
|
|
appCfg.SystemState = ss
|
|
localMgr = galleryop.NewLocalBackendManager(appCfg, ml)
|
|
})
|
|
|
|
AfterEach(func() {
|
|
os.RemoveAll(tempDir)
|
|
})
|
|
|
|
It("should delete a backend from the local filesystem", func() {
|
|
// Create a fake backend directory with run.sh
|
|
backendName := "test-backend"
|
|
backendDir := filepath.Join(tempDir, backendName)
|
|
Expect(os.MkdirAll(backendDir, 0750)).To(Succeed())
|
|
Expect(os.WriteFile(filepath.Join(backendDir, "run.sh"), []byte("#!/bin/bash\necho test"), 0755)).To(Succeed())
|
|
|
|
err := localMgr.DeleteBackend(backendName)
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// Assert directory is gone
|
|
_, err = os.Stat(backendDir)
|
|
Expect(os.IsNotExist(err)).To(BeTrue())
|
|
})
|
|
})
|
|
|
|
Context("DistributedModelManager", func() {
|
|
It("should delete model locally AND send model.delete to worker nodes", func() {
|
|
// Register two nodes with the model
|
|
node1 := &nodes.BackendNode{Name: "dm-n1", Address: "h1:50051"}
|
|
node2 := &nodes.BackendNode{Name: "dm-n2", Address: "h2:50051"}
|
|
Expect(registry.Register(context.Background(), node1, true)).To(Succeed())
|
|
Expect(registry.Register(context.Background(), node2, true)).To(Succeed())
|
|
Expect(registry.SetNodeModel(context.Background(), node1.ID, "big-model", "loaded", "", 0)).To(Succeed())
|
|
Expect(registry.SetNodeModel(context.Background(), node2.ID, "big-model", "loaded", "", 0)).To(Succeed())
|
|
|
|
// Subscribe to model.delete on both node subjects, track receipt
|
|
var deleteCount atomic.Int32
|
|
sub1, err := infra.NC.SubscribeReply(messaging.SubjectNodeModelDelete(node1.ID), func(data []byte, reply func([]byte)) {
|
|
var req messaging.ModelDeleteRequest
|
|
json.Unmarshal(data, &req)
|
|
Expect(req.ModelName).To(Equal("big-model"))
|
|
deleteCount.Add(1)
|
|
resp, _ := json.Marshal(messaging.ModelDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub1.Unsubscribe()
|
|
|
|
sub2, err := infra.NC.SubscribeReply(messaging.SubjectNodeModelDelete(node2.ID), func(data []byte, reply func([]byte)) {
|
|
var req messaging.ModelDeleteRequest
|
|
json.Unmarshal(data, &req)
|
|
deleteCount.Add(1)
|
|
resp, _ := json.Marshal(messaging.ModelDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub2.Unsubscribe()
|
|
|
|
FlushNATS(infra.NC)
|
|
|
|
// Create temp dir for local model files
|
|
tempDir, err := os.MkdirTemp("", "dist-model-test-*")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer os.RemoveAll(tempDir)
|
|
|
|
// Create a fake model config file
|
|
modelFile := filepath.Join(tempDir, "big-model.yaml")
|
|
Expect(os.WriteFile(modelFile, []byte("name: big-model\n"), 0644)).To(Succeed())
|
|
|
|
ss, err := system.GetSystemState(system.WithModelPath(tempDir))
|
|
Expect(err).ToNot(HaveOccurred())
|
|
ml := model.NewModelLoader(ss)
|
|
appCfg := config.NewApplicationConfig()
|
|
appCfg.SystemState = ss
|
|
|
|
adapter := nodes.NewRemoteUnloaderAdapter(registry, infra.NC)
|
|
distMgr := nodes.NewDistributedModelManager(appCfg, ml, adapter)
|
|
|
|
err = distMgr.DeleteModel("big-model")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// Local file should be deleted
|
|
_, statErr := os.Stat(modelFile)
|
|
Expect(os.IsNotExist(statErr)).To(BeTrue())
|
|
|
|
// Both workers should have received model.delete
|
|
Eventually(func() int32 { return deleteCount.Load() }, "5s").Should(Equal(int32(2)))
|
|
})
|
|
})
|
|
|
|
Context("DistributedBackendManager", func() {
|
|
It("should delete backend locally AND fan out backend.delete to all healthy nodes", func() {
|
|
// Register 3 nodes: 2 healthy, 1 unhealthy
|
|
node1 := &nodes.BackendNode{Name: "db-n1", Address: "h1:50051"}
|
|
node2 := &nodes.BackendNode{Name: "db-n2", Address: "h2:50051"}
|
|
node3 := &nodes.BackendNode{Name: "db-n3", Address: "h3:50051"}
|
|
Expect(registry.Register(context.Background(), node1, true)).To(Succeed())
|
|
Expect(registry.Register(context.Background(), node2, true)).To(Succeed())
|
|
Expect(registry.Register(context.Background(), node3, true)).To(Succeed())
|
|
Expect(registry.MarkUnhealthy(context.Background(), node3.ID)).To(Succeed())
|
|
|
|
// Subscribe to backend.delete on all 3 nodes
|
|
var deleteCount atomic.Int32
|
|
sub1, err := infra.NC.SubscribeReply(messaging.SubjectNodeBackendDelete(node1.ID), func(data []byte, reply func([]byte)) {
|
|
var req messaging.BackendDeleteRequest
|
|
json.Unmarshal(data, &req)
|
|
Expect(req.Backend).To(Equal("my-backend"))
|
|
deleteCount.Add(1)
|
|
resp, _ := json.Marshal(messaging.BackendDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub1.Unsubscribe()
|
|
|
|
sub2, err := infra.NC.SubscribeReply(messaging.SubjectNodeBackendDelete(node2.ID), func(data []byte, reply func([]byte)) {
|
|
var req messaging.BackendDeleteRequest
|
|
json.Unmarshal(data, &req)
|
|
deleteCount.Add(1)
|
|
resp, _ := json.Marshal(messaging.BackendDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub2.Unsubscribe()
|
|
|
|
var unhealthyReceived atomic.Int32
|
|
sub3, err := infra.NC.SubscribeReply(messaging.SubjectNodeBackendDelete(node3.ID), func(data []byte, reply func([]byte)) {
|
|
unhealthyReceived.Add(1)
|
|
resp, _ := json.Marshal(messaging.BackendDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub3.Unsubscribe()
|
|
|
|
FlushNATS(infra.NC)
|
|
|
|
// Create temp dir for local backend files
|
|
tempDir, err := os.MkdirTemp("", "dist-backend-test-*")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer os.RemoveAll(tempDir)
|
|
|
|
// Create a fake backend directory
|
|
backendDir := filepath.Join(tempDir, "my-backend")
|
|
Expect(os.MkdirAll(backendDir, 0750)).To(Succeed())
|
|
Expect(os.WriteFile(filepath.Join(backendDir, "run.sh"), []byte("#!/bin/bash\necho test"), 0755)).To(Succeed())
|
|
|
|
ss, err := system.GetSystemState(system.WithBackendPath(tempDir))
|
|
Expect(err).ToNot(HaveOccurred())
|
|
ml := model.NewModelLoader(ss)
|
|
appCfg := config.NewApplicationConfig()
|
|
appCfg.SystemState = ss
|
|
|
|
adapter := nodes.NewRemoteUnloaderAdapter(registry, infra.NC)
|
|
distMgr := nodes.NewDistributedBackendManager(appCfg, ml, adapter, registry)
|
|
|
|
err = distMgr.DeleteBackend("my-backend")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// Local backend dir should be deleted
|
|
_, statErr := os.Stat(backendDir)
|
|
Expect(os.IsNotExist(statErr)).To(BeTrue())
|
|
|
|
// 2 healthy nodes should have received backend.delete
|
|
Eventually(func() int32 { return deleteCount.Load() }, "5s").Should(Equal(int32(2)))
|
|
|
|
// Unhealthy node should NOT have received backend.delete
|
|
Consistently(func() int32 { return unhealthyReceived.Load() }, "1s").Should(Equal(int32(0)))
|
|
})
|
|
|
|
It("should succeed when backend exists only on remote workers (not locally)", func() {
|
|
// Register a healthy node
|
|
node1 := &nodes.BackendNode{Name: "db-remote-only", Address: "h1:50051"}
|
|
Expect(registry.Register(context.Background(), node1, true)).To(Succeed())
|
|
|
|
var deleteCount atomic.Int32
|
|
sub1, err := infra.NC.SubscribeReply(messaging.SubjectNodeBackendDelete(node1.ID), func(data []byte, reply func([]byte)) {
|
|
var req messaging.BackendDeleteRequest
|
|
json.Unmarshal(data, &req)
|
|
Expect(req.Backend).To(Equal("remote-only-backend"))
|
|
deleteCount.Add(1)
|
|
resp, _ := json.Marshal(messaging.BackendDeleteReply{Success: true})
|
|
reply(resp)
|
|
})
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer sub1.Unsubscribe()
|
|
|
|
FlushNATS(infra.NC)
|
|
|
|
// Use a temp dir with NO local backend directory — simulates frontend node
|
|
tempDir, err := os.MkdirTemp("", "dist-backend-remote-only-*")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
defer os.RemoveAll(tempDir)
|
|
|
|
ss, err := system.GetSystemState(system.WithBackendPath(tempDir))
|
|
Expect(err).ToNot(HaveOccurred())
|
|
ml := model.NewModelLoader(ss)
|
|
appCfg := config.NewApplicationConfig()
|
|
appCfg.SystemState = ss
|
|
|
|
adapter := nodes.NewRemoteUnloaderAdapter(registry, infra.NC)
|
|
distMgr := nodes.NewDistributedBackendManager(appCfg, ml, adapter, registry)
|
|
|
|
// Should NOT return an error even though the backend doesn't exist locally
|
|
err = distMgr.DeleteBackend("remote-only-backend")
|
|
Expect(err).ToNot(HaveOccurred())
|
|
|
|
// The healthy worker should still receive the deletion request
|
|
Eventually(func() int32 { return deleteCount.Load() }, "5s").Should(Equal(int32(1)))
|
|
})
|
|
})
|
|
})
|