mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-29 19:19:19 -04:00
When the LocalAI frontend deployment is scaled past one replica, the UI's
/api/operations poll round-robins between pods. Each pod kept the OpCache
(galleryID->jobID), OpStatus map, and the post-install in-memory caches
(ModelConfigLoader, UpgradeChecker) purely in-process. Reads never
consulted PostgreSQL or NATS even though writes already published to PG.
Symptoms:
- A user installing a model on replica A saw the operation card flicker
in and out as the load balancer alternated.
- The Models page re-fetched the whole gallery on every flicker because
useEffect([operations.length]) re-fires when the count changes.
- A chat completion that landed on replica B after the install completed
on replica A failed to find the new model — B's ModelConfigLoader was
still the old one because nothing told it to reload from disk.
- The UpgradeChecker 6-hour cache stayed stale on peer replicas after a
backend upgrade, so /api/backends/upgrades kept surfacing an upgrade
that had already shipped.
Mirror the jobs Dispatcher pattern for gallery ops:
- OpCache learns SetMessagingClient/SetGalleryStore + a Start(ctx) that
hydrates from PostgreSQL and subscribes to gallery.opcache.{start,end}.
Set/SetBackend now upsert cache_key + is_backend_op on the
gallery_operations row and broadcast OpCacheEvent so peers merge it in. The
hydrate path uses a new GalleryStore.ListActive() (status in {pending,
downloading, processing} and updated within 30 min).
- GalleryService.SubscribeBroadcasts wires a
SubjectGalleryProgressWildcard subscriber that calls a new lock-light mergeStatus into the
local statuses map, plus a SubjectGalleryCancelWildcard subscriber that
runs the locally-registered cancel func. Hydrate() restores active rows
from PostgreSQL on startup so a freshly-started replica is not
observably empty mid-install. CancelOperation tolerates the cancel func
living on a different replica and publishes anyway.
- modelHandler and backendHandler publish on the new
SubjectCacheInvalidateModels / SubjectCacheInvalidateBackends after
a successful install/delete/upgrade. SubscribeBroadcasts wires peers
to refresh: OnModelsChanged (re-runs LoadModelConfigsFromPath) and
OnBackendOpCompleted (re-triggers UpgradeChecker). The originating
replica reloads inline so it never enters the broadcast handler.
- OpStatus.Error (an error interface) flat-marshalled to "{}" over JSON,
so a failed install replicated to a peer arrived with a nil error and
the UI's failure banner never appeared. Add MarshalJSON/UnmarshalJSON
via an opStatusWire shim that round-trips Error as a string.
- UpdateStatus and CancelOperation now drop the mutex before publishing
to NATS or persisting to PostgreSQL. The wildcard subscriber's
mergeStatus loops back into the same service on the publishing replica
and would deadlock otherwise; this also prevents future PG round-trips
from stalling concurrent readers on every progress tick.
Tests cover the OpStatus error round-trip, OpCache propagation through a
shared in-memory bus, OpCache PostgreSQL hydration (active-only),
GalleryService progress + cancel broadcast, Nodes preservation across a
peer's bare progress tick, GalleryService hydration from PG, and the
two cache-invalidation broadcasts (models + backends). 44 specs total
in galleryop; routes/operations specs and jobs/agents suites still pass.
Assisted-by: claude-code:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
204 lines
8.2 KiB
Go
204 lines
8.2 KiB
Go
package distributed
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"gorm.io/gorm"
|
|
"gorm.io/gorm/clause"
|
|
)
|
|
|
|
// GalleryOperationRecord is the PostgreSQL row describing one model/backend
// download operation.
//
// The CacheKey and IsBackendOp columns mirror the in-memory OpCache that every
// frontend replica keeps. They are written as soon as a request lands, so a
// replica that has just started (or has just been routed to) can rebuild its
// OpCache from this table rather than answering `/api/operations` with an
// empty payload while the operation is still running on a peer.
type GalleryOperationRecord struct {
	ID                 string `gorm:"primaryKey;size:36" json:"id"`
	UserID             string `gorm:"index;size:36" json:"user_id,omitempty"`
	GalleryElementName string `gorm:"size:255" json:"gallery_element_name"`

	// CacheKey is the OpCache key (galleryID or node:<id>:<backend>).
	CacheKey string `gorm:"index;size:512" json:"cache_key,omitempty"`
	// IsBackendOp is true if the operation was installed via SetBackend.
	IsBackendOp bool `json:"is_backend_op"`
	// OpType is one of "model_install", "model_delete", "backend_install".
	OpType string `gorm:"size:32" json:"op_type"`
	// Status is one of pending, downloading, processing, completed, failed,
	// cancelled.
	Status string `gorm:"size:32;default:pending" json:"status"`
	// Progress ranges from 0.0 to 1.0.
	Progress float64 `json:"progress"`

	Message            string `gorm:"type:text" json:"message,omitempty"`
	Error              string `gorm:"type:text" json:"error,omitempty"`
	FileName           string `gorm:"size:512" json:"file_name,omitempty"`
	TotalFileSize      string `gorm:"size:32" json:"total_file_size,omitempty"`
	DownloadedFileSize string `gorm:"size:32" json:"downloaded_file_size,omitempty"`

	// FrontendID identifies which instance is processing the operation.
	FrontendID  string `gorm:"size:36" json:"frontend_id,omitempty"`
	Cancellable bool   `json:"cancellable"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
|
|
|
// activeStatuses enumerates the gallery_operations.status values that mark an
// operation a replica should still expose through /api/operations. Hydration
// and the dedup lookup both read this one set, so the two paths cannot drift
// apart on what "still active" means.
var activeStatuses = []string{
	"pending",
	"downloading",
	"processing",
}
|
|
|
|
// TableName returns the name of the table that stores GalleryOperationRecord
// rows.
func (GalleryOperationRecord) TableName() string {
	return "gallery_operations"
}
|
|
|
|
// GalleryStore manages gallery operation state in PostgreSQL.
|
|
type GalleryStore struct {
|
|
db *gorm.DB
|
|
}
|
|
|
|
// NewGalleryStore creates a new GalleryStore and auto-migrates.
|
|
func NewGalleryStore(db *gorm.DB) (*GalleryStore, error) {
|
|
if err := db.AutoMigrate(&GalleryOperationRecord{}); err != nil {
|
|
return nil, fmt.Errorf("migrating gallery_operations: %w", err)
|
|
}
|
|
return &GalleryStore{db: db}, nil
|
|
}
|
|
|
|
// Create stores a new gallery operation. Tolerates a row already existing
|
|
// for this ID — OpCache.Set may have written a placeholder row via
|
|
// UpsertCacheKey before the galleryop service goroutine called Create, and
|
|
// in that case we want to fill in the descriptive columns (gallery element
|
|
// name, op type, status) rather than fail with a primary-key conflict.
|
|
// CacheKey and IsBackendOp are intentionally not in DoUpdates so the
|
|
// placeholder's values win.
|
|
func (s *GalleryStore) Create(op *GalleryOperationRecord) error {
|
|
if op.ID == "" {
|
|
op.ID = uuid.New().String()
|
|
}
|
|
op.CreatedAt = time.Now()
|
|
op.UpdatedAt = op.CreatedAt
|
|
return s.db.Clauses(clause.OnConflict{
|
|
Columns: []clause.Column{{Name: "id"}},
|
|
DoUpdates: clause.AssignmentColumns([]string{
|
|
"gallery_element_name", "op_type", "status",
|
|
"frontend_id", "user_id", "cancellable", "updated_at",
|
|
}),
|
|
}).Create(op).Error
|
|
}
|
|
|
|
// UpdateProgress updates progress for an operation.
|
|
func (s *GalleryStore) UpdateProgress(id string, progress float64, message, downloadedSize string) error {
|
|
return s.db.Model(&GalleryOperationRecord{}).Where("id = ?", id).Updates(map[string]any{
|
|
"progress": progress,
|
|
"message": message,
|
|
"downloaded_file_size": downloadedSize,
|
|
"updated_at": time.Now(),
|
|
}).Error
|
|
}
|
|
|
|
// UpdateStatus updates the status of an operation.
|
|
func (s *GalleryStore) UpdateStatus(id, status, errMsg string) error {
|
|
updates := map[string]any{
|
|
"status": status,
|
|
"updated_at": time.Now(),
|
|
}
|
|
if errMsg != "" {
|
|
updates["error"] = errMsg
|
|
}
|
|
return s.db.Model(&GalleryOperationRecord{}).Where("id = ?", id).Updates(updates).Error
|
|
}
|
|
|
|
// Get retrieves an operation by ID.
|
|
func (s *GalleryStore) Get(id string) (*GalleryOperationRecord, error) {
|
|
var op GalleryOperationRecord
|
|
if err := s.db.First(&op, "id = ?", id).Error; err != nil {
|
|
return nil, err
|
|
}
|
|
return &op, nil
|
|
}
|
|
|
|
// List returns all operations, optionally filtered by status.
|
|
func (s *GalleryStore) List(status string) ([]GalleryOperationRecord, error) {
|
|
var ops []GalleryOperationRecord
|
|
q := s.db.Order("created_at DESC")
|
|
if status != "" {
|
|
q = q.Where("status = ?", status)
|
|
}
|
|
return ops, q.Find(&ops).Error
|
|
}
|
|
|
|
// ListActive returns operations still considered in-flight — used by replicas
|
|
// to rehydrate their in-memory OpCache + statuses on startup. Stale records
|
|
// (older than 30 minutes without an update) are excluded so a crashed peer's
|
|
// orphaned rows never resurrect on a healthy replica; the existing CleanStale
|
|
// reaper eventually marks them failed.
|
|
func (s *GalleryStore) ListActive() ([]GalleryOperationRecord, error) {
|
|
var ops []GalleryOperationRecord
|
|
staleCutoff := time.Now().Add(-30 * time.Minute)
|
|
err := s.db.Where("status IN ? AND updated_at > ?", activeStatuses, staleCutoff).
|
|
Order("created_at DESC").Find(&ops).Error
|
|
return ops, err
|
|
}
|
|
|
|
// UpsertCacheKey records the in-memory OpCache key + IsBackendOp flag on the
|
|
// gallery_operations row, creating the row if it does not exist yet.
|
|
//
|
|
// Why upsert: OpCache.Set is called by the HTTP admission handler before the
|
|
// galleryop service goroutine processes the operation and calls Create. If
|
|
// OpCache wrote with a plain Updates() those columns would silently be lost
|
|
// in the window between the two, so peer replicas hydrating in that window
|
|
// would still rebuild an empty OpCache. Upsert closes that window.
|
|
func (s *GalleryStore) UpsertCacheKey(id, cacheKey string, isBackend bool) error {
|
|
now := time.Now()
|
|
rec := GalleryOperationRecord{
|
|
ID: id,
|
|
CacheKey: cacheKey,
|
|
IsBackendOp: isBackend,
|
|
Status: "pending",
|
|
CreatedAt: now,
|
|
UpdatedAt: now,
|
|
}
|
|
return s.db.Clauses(clause.OnConflict{
|
|
Columns: []clause.Column{{Name: "id"}},
|
|
DoUpdates: clause.Assignments(map[string]any{
|
|
"cache_key": cacheKey,
|
|
"is_backend_op": isBackend,
|
|
"updated_at": now,
|
|
}),
|
|
}).Create(&rec).Error
|
|
}
|
|
|
|
// FindDuplicate checks if another instance is already downloading the same element.
|
|
// Only considers records updated within the last 30 minutes as active — older
|
|
// in-progress records are assumed to be stale (crashed instance).
|
|
func (s *GalleryStore) FindDuplicate(elementName string) (*GalleryOperationRecord, error) {
|
|
var op GalleryOperationRecord
|
|
staleCutoff := time.Now().Add(-30 * time.Minute)
|
|
err := s.db.Where("gallery_element_name = ? AND status IN ? AND updated_at > ?", elementName,
|
|
activeStatuses, staleCutoff).First(&op).Error
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &op, nil
|
|
}
|
|
|
|
// Cancel marks an operation as cancelled.
|
|
func (s *GalleryStore) Cancel(id string) error {
|
|
return s.UpdateStatus(id, "cancelled", "")
|
|
}
|
|
|
|
// CleanStale marks abandoned in-progress operations as failed.
|
|
// Should be called on startup to recover from crashed instances that
|
|
// left records in pending/downloading/processing state.
|
|
func (s *GalleryStore) CleanStale(age time.Duration) error {
|
|
cutoff := time.Now().Add(-age)
|
|
return s.db.Model(&GalleryOperationRecord{}).
|
|
Where("updated_at < ? AND status IN ?", cutoff, activeStatuses).
|
|
Updates(map[string]any{
|
|
"status": "failed",
|
|
"error": "stale operation cleaned up on startup",
|
|
"updated_at": time.Now(),
|
|
}).Error
|
|
}
|
|
|
|
// CleanOld removes operations older than the given duration.
|
|
func (s *GalleryStore) CleanOld(retention time.Duration) error {
|
|
cutoff := time.Now().Add(-retention)
|
|
return s.db.Where("created_at < ? AND status IN ?", cutoff,
|
|
[]string{"completed", "failed", "cancelled"}).
|
|
Delete(&GalleryOperationRecord{}).Error
|
|
}
|