mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 11:36:31 -04:00
* feat(gallery): verify backend OCI images with keyless cosign Close a trust gap where a registry compromise or MITM could silently replace a backend image: the gallery YAML tells LocalAI which image to pull, but until now nothing verified the bytes came from our CI. Consumer (pkg/oci/cosignverify): - New package using sigstore-go to verify keyless-cosign signatures. - OCI 1.1 referrers API + new bundle format (no legacy :tag.sig). - Policy fields: Issuer / IssuerRegex / Identity / IdentityRegex / NotBefore. NotBefore is the revocation lever — keyless Fulcio certs are ephemeral so revocation is policy-side; advancing not_before in the gallery YAML invalidates every signature predating the cutoff. - TUF trusted root cached process-wide so N backends from one gallery do 1 fetch, not N. Plumbing: - pkg/downloader: ImageVerifier interface + WithImageVerifier option threaded through DownloadFileWithContext. Verification runs between oci.GetImage and oci.ExtractOCIImage, with digest pinning via pinnedImageRef to close the TOCTOU window. Skips the verifier's HEAD when the ref is already digest-pinned. - core/config: Gallery.Verification YAML block. - core/gallery: backendDownloadOptions builds the verifier from the policy; applied on initial URI, mirrors, and tag fallbacks. - core/gallery/upgrade: the upgrade path now routes through the same options builder. A regression Ginkgo spec pins this contract — without it, UpgradeBackend silently bypassed verification. - core/cli: --require-backend-integrity (LOCALAI_REQUIRE_BACKEND_INTEGRITY) escalates missing policy / empty SHA256 from warn to hard-fail. Producer (.github/workflows/backend_merge.yml): - id-token: write at job scope (PR-fork-safe via existing event gate). - sigstore/cosign-installer@v3 pinned to v2.4.1. - After each docker buildx imagetools create, resolve the manifest list digest and run cosign sign --recursive --new-bundle-format --registry-referrers-mode=oci-1-1 against repo@digest. --recursive signs the index and every per-arch entry, matching how the consumer resolves a tag to a platform-specific manifest before verifying. Rollout: backend/index.yaml has no `verification:` block yet, so this PR is backward-compatible — installs proceed with a warning until the gallery is populated. Strict mode is opt-in. Assisted-by: claude-code:claude-opus-4-7 [Bash] [Edit] [Read] [Write] [WebSearch] [WebFetch] Signed-off-by: Richard Palethorpe <io@richiejp.com> * refactor(gallery): plumb RequireBackendIntegrity through config instead of env The previous implementation re-exported the --require-backend-integrity CLI flag into LOCALAI_REQUIRE_BACKEND_INTEGRITY via os.Setenv, then re-read it in core/gallery via os.Getenv. This leaked process state into the gallery package and made the flag impossible to override per-call or test without touching the env. Add RequireBackendIntegrity to ApplicationConfig (with a matching WithRequireBackendIntegrity AppOption) and thread the bool through every install/upgrade path: InstallBackend, InstallBackendFromGallery, UpgradeBackend, InstallModelFromGallery, InstallExternalBackend, ApplyGalleryFromString/File, startup.InstallModels. Worker subcommands gain the same env-bound flag on WorkerFlags so distributed-worker installs honor it consistently with the worker daemon path. Add a forbidigo lint rule against os.Getenv / os.LookupEnv / os.Environ to keep the env-leak pattern from creeping back. Existing offenders (p2p, config loaders, etc.) are baseline-grandfathered by the existing new-from-merge-base: origin/master setting; targeted path exclusions cover the legitimate cases — kong CLI entry points, backend subprocesses, system capability probes, gRPC AUTH_TOKEN inheritance, test gating env vars. Assisted-by: claude-code:claude-opus-4-7 Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>
246 lines
7.4 KiB
Go
246 lines
7.4 KiB
Go
package application
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/gallery"
|
|
"github.com/mudler/LocalAI/core/services/advisorylock"
|
|
"github.com/mudler/LocalAI/core/services/galleryop"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/system"
|
|
"github.com/mudler/xlog"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// UpgradeChecker periodically checks for backend upgrades and optionally
|
|
// auto-upgrades them. It caches the last check results for API queries.
|
|
//
|
|
// In standalone mode it runs a simple ticker loop.
|
|
// In distributed mode it uses a PostgreSQL advisory lock so that only one
|
|
// frontend instance performs periodic checks and auto-upgrades at a time.
|
|
type UpgradeChecker struct {
|
|
appConfig *config.ApplicationConfig
|
|
modelLoader *model.ModelLoader
|
|
galleries []config.Gallery
|
|
systemState *system.SystemState
|
|
db *gorm.DB // non-nil in distributed mode
|
|
// backendManagerFn lazily returns the current backend manager (may be
|
|
// swapped from Local to Distributed after startup). Pulled through each
|
|
// check so the UpgradeChecker uses whichever is active. In distributed
|
|
// mode this ensures CheckUpgrades asks workers instead of the (empty)
|
|
// frontend filesystem — fixing the bug where upgrades never surfaced.
|
|
backendManagerFn func() galleryop.BackendManager
|
|
|
|
checkInterval time.Duration
|
|
stop chan struct{}
|
|
done chan struct{}
|
|
triggerCh chan struct{}
|
|
|
|
mu sync.RWMutex
|
|
lastUpgrades map[string]gallery.UpgradeInfo
|
|
lastCheckTime time.Time
|
|
}
|
|
|
|
// NewUpgradeChecker creates a new UpgradeChecker service.
|
|
// Pass db=nil for standalone mode, or a *gorm.DB for distributed mode
|
|
// (uses advisory locks so only one instance runs periodic checks).
|
|
// backendManagerFn is optional; when set, CheckUpgrades is routed through
|
|
// the active backend manager — required in distributed mode so the check
|
|
// aggregates from workers rather than the empty frontend filesystem.
|
|
func NewUpgradeChecker(appConfig *config.ApplicationConfig, ml *model.ModelLoader, db *gorm.DB, backendManagerFn func() galleryop.BackendManager) *UpgradeChecker {
|
|
return &UpgradeChecker{
|
|
appConfig: appConfig,
|
|
modelLoader: ml,
|
|
galleries: appConfig.BackendGalleries,
|
|
systemState: appConfig.SystemState,
|
|
db: db,
|
|
backendManagerFn: backendManagerFn,
|
|
checkInterval: 6 * time.Hour,
|
|
stop: make(chan struct{}),
|
|
done: make(chan struct{}),
|
|
triggerCh: make(chan struct{}, 1),
|
|
lastUpgrades: make(map[string]gallery.UpgradeInfo),
|
|
}
|
|
}
|
|
|
|
// Run starts the upgrade checker loop. It waits 30 seconds after startup,
|
|
// performs an initial check, then re-checks every 6 hours.
|
|
//
|
|
// In distributed mode, periodic checks are guarded by a PostgreSQL advisory
|
|
// lock so only one frontend instance runs them. On-demand triggers (TriggerCheck)
|
|
// and the initial check always run locally for fast API response cache warming.
|
|
func (uc *UpgradeChecker) Run(ctx context.Context) {
|
|
defer close(uc.done)
|
|
|
|
// Initial delay: don't slow down startup. Short enough that operators
|
|
// don't stare at an empty upgrade banner for long; long enough that
|
|
// workers have registered and reported their installed backends.
|
|
initialDelay := 10 * time.Second
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-uc.stop:
|
|
return
|
|
case <-time.After(initialDelay):
|
|
}
|
|
|
|
// First check always runs locally (to warm the cache on this instance)
|
|
uc.runCheck(ctx)
|
|
|
|
if uc.db != nil {
|
|
// Distributed mode: use advisory lock for periodic checks.
|
|
// RunLeaderLoop ticks every checkInterval; only the lock holder executes.
|
|
go advisorylock.RunLeaderLoop(ctx, uc.db, advisorylock.KeyBackendUpgradeCheck, uc.checkInterval, func() {
|
|
uc.runCheck(ctx)
|
|
})
|
|
|
|
// Still listen for on-demand triggers (from API / settings change)
|
|
// and stop signal — these run on every instance.
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-uc.stop:
|
|
return
|
|
case <-uc.triggerCh:
|
|
uc.runCheck(ctx)
|
|
}
|
|
}
|
|
} else {
|
|
// Standalone mode: simple ticker loop
|
|
ticker := time.NewTicker(uc.checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-uc.stop:
|
|
return
|
|
case <-ticker.C:
|
|
uc.runCheck(ctx)
|
|
case <-uc.triggerCh:
|
|
uc.runCheck(ctx)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Shutdown stops the upgrade checker loop.
|
|
func (uc *UpgradeChecker) Shutdown() {
|
|
close(uc.stop)
|
|
<-uc.done
|
|
}
|
|
|
|
// TriggerCheck forces an immediate upgrade check on this instance.
|
|
func (uc *UpgradeChecker) TriggerCheck() {
|
|
select {
|
|
case uc.triggerCh <- struct{}{}:
|
|
default:
|
|
// Already triggered, skip
|
|
}
|
|
}
|
|
|
|
// GetAvailableUpgrades returns the cached upgrade check results.
|
|
func (uc *UpgradeChecker) GetAvailableUpgrades() map[string]gallery.UpgradeInfo {
|
|
uc.mu.RLock()
|
|
defer uc.mu.RUnlock()
|
|
|
|
// Return a copy to avoid races
|
|
result := make(map[string]gallery.UpgradeInfo, len(uc.lastUpgrades))
|
|
for k, v := range uc.lastUpgrades {
|
|
result[k] = v
|
|
}
|
|
return result
|
|
}
|
|
|
|
func (uc *UpgradeChecker) runCheck(ctx context.Context) {
|
|
var (
|
|
upgrades map[string]gallery.UpgradeInfo
|
|
err error
|
|
)
|
|
if uc.backendManagerFn != nil {
|
|
if bm := uc.backendManagerFn(); bm != nil {
|
|
upgrades, err = bm.CheckUpgrades(ctx)
|
|
}
|
|
}
|
|
if upgrades == nil && err == nil {
|
|
upgrades, err = gallery.CheckBackendUpgrades(ctx, uc.galleries, uc.systemState)
|
|
}
|
|
|
|
uc.mu.Lock()
|
|
uc.lastCheckTime = time.Now()
|
|
if err != nil {
|
|
xlog.Debug("Backend upgrade check failed", "error", err)
|
|
uc.mu.Unlock()
|
|
return
|
|
}
|
|
uc.lastUpgrades = upgrades
|
|
uc.mu.Unlock()
|
|
|
|
if len(upgrades) == 0 {
|
|
xlog.Debug("All backends up to date")
|
|
return
|
|
}
|
|
|
|
// Log available upgrades
|
|
for name, info := range upgrades {
|
|
if info.AvailableVersion != "" {
|
|
xlog.Info("Backend upgrade available",
|
|
"backend", name,
|
|
"installed", info.InstalledVersion,
|
|
"available", info.AvailableVersion)
|
|
} else {
|
|
xlog.Info("Backend upgrade available (new build)",
|
|
"backend", name)
|
|
}
|
|
}
|
|
|
|
// Auto-upgrade if enabled. Route through the active BackendManager so
|
|
// distributed-mode upgrades fan out to workers via NATS — calling
|
|
// gallery.UpgradeBackend directly would look up the backend on the
|
|
// frontend filesystem, which is empty in distributed mode and produces
|
|
// "backend not found" while the cluster still reports an upgrade.
|
|
if uc.appConfig.AutoUpgradeBackends {
|
|
var bm galleryop.BackendManager
|
|
if uc.backendManagerFn != nil {
|
|
bm = uc.backendManagerFn()
|
|
}
|
|
for name, info := range upgrades {
|
|
xlog.Info("Auto-upgrading backend", "backend", name,
|
|
"from", info.InstalledVersion, "to", info.AvailableVersion)
|
|
var err error
|
|
if bm != nil {
|
|
err = bm.UpgradeBackend(ctx, name, nil)
|
|
} else {
|
|
err = gallery.UpgradeBackend(ctx, uc.systemState, uc.modelLoader,
|
|
uc.galleries, name, nil, uc.appConfig.RequireBackendIntegrity)
|
|
}
|
|
if err != nil {
|
|
xlog.Error("Failed to auto-upgrade backend",
|
|
"backend", name, "error", err)
|
|
} else {
|
|
xlog.Info("Backend upgraded successfully", "backend", name,
|
|
"version", info.AvailableVersion)
|
|
}
|
|
}
|
|
// Re-check to update cache after upgrades. Route through the same
|
|
// BackendManager so distributed mode reflects the worker view.
|
|
var freshUpgrades map[string]gallery.UpgradeInfo
|
|
var freshErr error
|
|
if bm != nil {
|
|
freshUpgrades, freshErr = bm.CheckUpgrades(ctx)
|
|
} else {
|
|
freshUpgrades, freshErr = gallery.CheckBackendUpgrades(ctx, uc.galleries, uc.systemState)
|
|
}
|
|
if freshErr == nil {
|
|
uc.mu.Lock()
|
|
uc.lastUpgrades = freshUpgrades
|
|
uc.mu.Unlock()
|
|
}
|
|
}
|
|
}
|