chore(deps): bump torch in /backend/python/vllm

Bumps torch from 2.9.1+cpu to 2.12.1+xpu.

---
updated-dependencies:
- dependency-name: torch
  dependency-version: 2.12.1+xpu
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-07-02 12:26:49 -04:00 · 2026-07-01 07:03:43 +00:00
31 changed files with 31 additions and 510 deletions
--- a/11
+++ b/11
@@ -171,17 +171,6 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
    ln -s /opt/rocm-**/lib/llvm/lib/libomp.so /usr/lib/libomp.so \
    ; fi

-# ROCm's bundled libdrm_amdgpu is built with a hardcoded fallback lookup path
-# for the ASIC ID table (/opt/amdgpu/share/libdrm/amdgpu.ids), which only exists
-# if AMD's full amdgpu graphics/DKMS stack is installed. This compute-only image
-# doesn't have it, so hipblas/rocBLAS log "No such file or directory" on every
-# model load and can fail to identify the GPU. Point it at the equivalent file
-# Ubuntu's libdrm-common package already ships.
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ -f /usr/share/libdrm/amdgpu.ids ] && [ ! -e /opt/amdgpu/share/libdrm/amdgpu.ids ]; then \
-    mkdir -p /opt/amdgpu/share/libdrm && \
-    ln -s /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids \
-    ; fi
-
 RUN expr "${BUILD_TYPE}" = intel && echo "intel" > /run/localai/capability || echo "not intel"

 # Cuda
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=068b173649f2fd8dc96b35ada5a0b76d8985105d
+IK_LLAMA_VERSION?=29431b31c89e79c10f8736e8f2742485ba1713d6
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=4fc4ec5541b243957ae5099edb67372f8f3b550e
+LLAMA_VERSION?=6f4f53f2b7da54fcdbbecaaa734337c337ad6176
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=fcbc8718e654995e3bd2d0c98bcb8e55e297d23c
+CRISPASR_VERSION?=3b93758f9725d400eca82976f895e4cec3f31260
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
+# Upstream pin lives below as PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9 so the bump script
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
+PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=3590aa8d626e671a1b1dc84506ea2932a243a480
+STABLEDIFFUSION_GGML_VERSION?=484baa41e5e006c52dcd4addc38c830b9489745f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=6fc7c33b4c3a2cec83e4b65abd5e96a890480375
+WHISPER_CPP_VERSION?=0874de3e8e8e48361dba85c7fe6d176f008bf158
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -748,12 +748,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # When (A) native streaming ran cleanly, per-delta yields above already
        # delivered everything — do NOT extract again on the full text or we'd
        # duplicate content/tool_calls into the final chunk.
-        # NOTE: `native_streaming` is a capability flag ("streaming parser is
-        # available"), not a state flag ("streaming actually ran"). For
-        # non-streaming requests it is still True but the per-delta loop was
-        # never entered, so we MUST still run extract_tool_calls here. Hence
-        # the explicit `streaming and …` guard on both branches.
-        if has_tool_parser and not (streaming and native_streaming and not native_streaming_error):
+        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
                tp = tp_instance
                if tp is None:
@@ -775,7 +770,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
-        elif streaming and native_streaming and not native_streaming_error:
+        elif native_streaming and not native_streaming_error:
            # Per-delta path already emitted content + tool_calls; the final
            # chat_delta should carry only metadata (token counts, logprobs).
            content = ""
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -104,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
    # vllm pin (requirements-cublas13-after.txt, bumped independently against
    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260701132215"
+    VLLM_METAL_VERSION="v0.3.0.dev20260628073537"

    # The coupled vLLM source version is whatever this vllm-metal release builds
    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
--- a/backend/python/vllm/requirements-cpu.txt
+++ b/backend/python/vllm/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.9.1+cpu
+torch==2.12.1+cpu
 torchvision
 torchaudio
 transformers
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -356,12 +356,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		PrefixConfig:     prefixCfg,
 		Pressure:         pressure,
 		SharedModels:     cfg.Distributed.SharedModels,
-		// Cap how long a cold load may hold the per-model advisory lock: the
-		// configured backend.install deadline plus a margin for file staging and
-		// the remote LoadModel. Derived from the install timeout so raising it
-		// (for slow links pulling multi-GB images) widens the ceiling too,
-		// instead of letting the static default cut a legitimately slow load.
-		ModelLoadCeiling: cfg.Distributed.BackendInstallTimeoutOrDefault() + 10*time.Minute,
 	})

 	// Wire staging-progress broadcasting so file-staging shows up on every
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -369,7 +369,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 	}

 	for _, backend := range options.ExternalBackends {
-		if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", false, options.RequireBackendIntegrity); err != nil {
+		if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", options.RequireBackendIntegrity); err != nil {
 			xlog.Error("error installing external backend", "error", err)
 		}
 	}
--- a/core/cli/backends.go
+++ b/core/cli/backends.go
@@ -127,7 +127,7 @@ func (bi *BackendsInstall) Run(ctx *cliContext.Context) error {
 	}

 	modelLoader := model.NewModelLoader(systemState)
-	err = galleryop.InstallExternalBackend(context.Background(), galleries, systemState, modelLoader, progressCallback, bi.BackendArgs, bi.Name, bi.Alias, false, bi.RequireBackendIntegrity)
+	err = galleryop.InstallExternalBackend(context.Background(), galleries, systemState, modelLoader, progressCallback, bi.BackendArgs, bi.Name, bi.Alias, bi.RequireBackendIntegrity)
 	if err != nil {
 		return err
 	}
--- a/core/http/endpoints/localai/backend.go
+++ b/core/http/endpoints/localai/backend.go
@@ -65,10 +65,6 @@ type BackendEndpointService struct {

 type GalleryBackend struct {
 	ID string `json:"id"`
-	// Force reinstalls the backend even when it is already installed and
-	// runnable. Off by default so apply stays idempotent for supervising
-	// apps that ensure their backend on every boot.
-	Force bool `json:"force"`
 }

 func CreateBackendEndpointService(galleries []config.Gallery, systemState *system.SystemState, backendApplier *galleryop.GalleryService, upgradeChecker UpgradeInfoProvider) BackendEndpointService {
@@ -107,9 +103,7 @@ func (mgs *BackendEndpointService) GetAllStatusEndpoint() echo.HandlerFunc {
 	}
 }

-// ApplyBackendEndpoint installs a new backend to a LocalAI instance. The op is
-// idempotent: an already-installed, runnable backend is left alone unless the
-// request sets "force": true (explicit reinstall).
+// ApplyBackendEndpoint installs a new backend to a LocalAI instance
 // @Summary Install backends to LocalAI.
 // @Tags backends
 // @Param request body GalleryBackend true "query params"
@@ -143,7 +137,6 @@ func (mgs *BackendEndpointService) ApplyBackendEndpoint(systemState *system.Syst
 			ID:                 uuid.String(),
 			GalleryElementName: input.ID,
 			Galleries:          mgs.galleries,
-			Force:              input.Force,
 		}

 		return c.JSON(200, schema.BackendResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%sbackends/jobs/%s", middleware.BaseURL(c), uuid.String())})
--- a/core/http/endpoints/localai/backend_apply_test.go
+++ b/core/http/endpoints/localai/backend_apply_test.go
@@ -1,87 +0,0 @@
-package localai_test
-
-import (
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"strings"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/gallery"
-	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
-	"github.com/mudler/LocalAI/core/services/galleryop"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// POST /backends/apply must be idempotent by default: supervising apps call it
-// on every boot to ensure a backend exists, and forcing a reinstall there
-// re-downloads the whole artifact each time. Reinstall stays available behind
-// the explicit force flag.
-var _ = Describe("POST /backends/apply force plumbing", func() {
-	var (
-		app     *echo.Echo
-		gs      *galleryop.GalleryService
-		tmpDir  string
-		received chan galleryop.ManagementOp[gallery.GalleryBackend, any]
-	)
-
-	BeforeEach(func() {
-		app = echo.New()
-
-		var err error
-		tmpDir, err = os.MkdirTemp("", "backends-apply-test-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		systemState, err := system.GetSystemState(system.WithBackendPath(tmpDir))
-		Expect(err).NotTo(HaveOccurred())
-		appConfig := &config.ApplicationConfig{SystemState: systemState}
-
-		// The service is deliberately not started: the test reads the op off
-		// the (unbuffered) channel itself.
-		gs = galleryop.NewGalleryService(appConfig, model.NewModelLoader(systemState))
-		svc := CreateBackendEndpointService(nil, systemState, gs, nil)
-		app.POST("/backends/apply", svc.ApplyBackendEndpoint(systemState))
-
-		received = make(chan galleryop.ManagementOp[gallery.GalleryBackend, any], 1)
-		go func() {
-			op := <-gs.BackendGalleryChannel
-			received <- op
-		}()
-	})
-
-	AfterEach(func() {
-		Expect(os.RemoveAll(tmpDir)).To(Succeed())
-	})
-
-	apply := func(body string) *httptest.ResponseRecorder {
-		req := httptest.NewRequest(http.MethodPost, "/backends/apply", strings.NewReader(body))
-		req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
-		rec := httptest.NewRecorder()
-		app.ServeHTTP(rec, req)
-		return rec
-	}
-
-	It("enqueues a non-forced op by default", func() {
-		rec := apply(`{"id":"llama-cpp"}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-
-		var op galleryop.ManagementOp[gallery.GalleryBackend, any]
-		Eventually(received).Should(Receive(&op))
-		Expect(op.GalleryElementName).To(Equal("llama-cpp"))
-		Expect(op.Force).To(BeFalse())
-	})
-
-	It("enqueues a forced op when the request sets force", func() {
-		rec := apply(`{"id":"llama-cpp","force":true}`)
-		Expect(rec.Code).To(Equal(http.StatusOK))
-
-		var op galleryop.ManagementOp[gallery.GalleryBackend, any]
-		Eventually(received).Should(Receive(&op))
-		Expect(op.GalleryElementName).To(Equal("llama-cpp"))
-		Expect(op.Force).To(BeTrue())
-	})
-})
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -1243,9 +1243,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 			Galleries:          appConfig.BackendGalleries,
 			Context:            ctx,
 			CancelFunc:         cancelFunc,
-			// The React UI's "Reinstall backend" action reuses this route, so
-			// the op must force even when the backend is already installed.
-			Force: true,
 		}
 		// Store cancellation function immediately so queued operations can be cancelled
 		galleryService.StoreCancellation(uid, cancelFunc)
--- a/core/services/advisorylock/advisorylock.go
+++ b/core/services/advisorylock/advisorylock.go
@@ -6,39 +6,10 @@ import (
 	"hash/fnv"
 	"strings"
 	"sync"
-	"time"

 	"gorm.io/gorm"
 )

-// advisoryLockWaitBackstop bounds, server-side, how long we will wait to
-// acquire a blocking advisory lock when the caller's context carries no
-// deadline (e.g. a startup schema migration using context.Background()). It
-// only exists so such a caller cannot hang forever behind a holder whose
-// session never releases the lock; it is far longer than any legitimate
-// guarded section. A var (not const) so tests can shrink it.
-var advisoryLockWaitBackstop = 30 * time.Minute
-
-// advisoryLockTimeoutMargin is added to a context's remaining budget when
-// deriving the server-side lock_timeout, so the Go context's own (cleaner)
-// cancellation fires first and the server bound is only ever a backstop.
-const advisoryLockTimeoutMargin = 30 * time.Second
-
-// advisoryLockWaitBudget returns the server-side lock_timeout to use for a
-// blocking acquire: the caller context's remaining time plus a margin (so the
-// Go context still governs), or the backstop when the context has no deadline.
-// Never returns zero - "wait forever" must not be possible.
-func advisoryLockWaitBudget(ctx context.Context) time.Duration {
-	if dl, ok := ctx.Deadline(); ok {
-		budget := time.Until(dl) + advisoryLockTimeoutMargin
-		if budget < time.Second {
-			budget = time.Second
-		}
-		return budget
-	}
-	return advisoryLockWaitBackstop
-}
-
 // localLocks holds one buffered channel (capacity 1) per lock key, used as an
 // in-process mutex for non-PostgreSQL dialects (SQLite). A SQLite auth DB is
 // effectively single-process, so serializing guarded sections within this
@@ -159,27 +130,6 @@ func WithLockCtx(ctx context.Context, db *gorm.DB, key int64, fn func() error) e
 	}
 	defer conn.Close()

-	// Override any deployment-wide lock_timeout on this dedicated connection.
-	// Operators commonly set a short global lock_timeout (on the role or
-	// database) to bound ordinary row-lock waits. Applied to the blocking
-	// pg_advisory_lock below, it aborts the wait with SQLSTATE 55P03 and turns
-	// LocalAI's intentional cross-replica "wait your turn, then re-check"
-	// coordination into a hard error for the caller (e.g. a chat request that
-	// just wanted to reuse a model another replica is loading).
-	//
-	// We do NOT disable it outright (lock_timeout = 0 would wait forever, which
-	// is unsafe for the schema-migration callers that pass context.Background()).
-	// Instead we set a bound derived from the caller's context: its remaining
-	// budget plus a margin so the Go context's cancellation wins with a clean
-	// error, or a finite backstop when the context has no deadline.
-	waitBudget := advisoryLockWaitBudget(ctx)
-	if _, err := conn.ExecContext(ctx,
-		fmt.Sprintf("SET lock_timeout = %d", waitBudget.Milliseconds())); err != nil {
-		return fmt.Errorf("advisorylock: setting lock_timeout: %w", err)
-	}
-	// Restore the session default before this pooled connection is reused.
-	defer func() { _, _ = conn.ExecContext(context.Background(), "RESET lock_timeout") }()
-
 	if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", key); err != nil {
 		return fmt.Errorf("advisorylock: acquiring lock %d: %w", key, err)
 	}
--- a/core/services/advisorylock/advisorylock_test.go
+++ b/core/services/advisorylock/advisorylock_test.go
@@ -158,87 +158,6 @@ var _ = Describe("AdvisoryLock", func() {
 			Expect(err).To(HaveOccurred())
 		})

-		It("waits out a short server-side lock_timeout instead of failing with 55P03", func() {
-			const lockKey int64 = 703
-
-			// Reproduce the production deployment that triggered this: a short
-			// global lock_timeout set on the database. Without the fix, a waiter
-			// blocked on pg_advisory_lock() is aborted by the server after this
-			// window and surfaces SQLSTATE 55P03 ("canceling statement due to
-			// lock timeout") to the caller instead of waiting for its turn.
-			Expect(db.Exec("ALTER DATABASE testdb SET lock_timeout = '300ms'").Error).ToNot(HaveOccurred())
-			sqlDB, err := db.DB()
-			Expect(err).ToNot(HaveOccurred())
-			// Drop pooled connections so subsequent ones reconnect and inherit
-			// the new database-level lock_timeout default.
-			sqlDB.SetMaxIdleConns(0)
-
-			holding := make(chan struct{})
-			released := make(chan struct{})
-			go func() {
-				defer GinkgoRecover()
-				herr := WithLockCtx(context.Background(), db, lockKey, func() error {
-					close(holding)
-					// Hold well past the 300ms server lock_timeout.
-					time.Sleep(1 * time.Second)
-					return nil
-				})
-				Expect(herr).ToNot(HaveOccurred())
-				close(released)
-			}()
-
-			<-holding // ensure the holder owns the lock before we contend
-
-			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-			defer cancel()
-			executed := false
-			start := time.Now()
-			werr := WithLockCtx(ctx, db, lockKey, func() error {
-				executed = true
-				return nil
-			})
-			Expect(werr).ToNot(HaveOccurred(),
-				"waiter should wait out the in-progress hold, not fail with lock_timeout (55P03)")
-			Expect(executed).To(BeTrue())
-			Expect(time.Since(start)).To(BeNumerically(">=", 400*time.Millisecond),
-				"waiter should have actually waited for the holder to release")
-			<-released
-		})
-
-		It("bounds a deadline-less waiter with the backstop instead of waiting forever", func() {
-			const lockKey int64 = 704
-
-			// A caller with no context deadline (e.g. startup schema migration
-			// passing context.Background()) must not hang forever if the holder
-			// never releases. Shrink the backstop so the test is fast.
-			origBackstop := advisoryLockWaitBackstop
-			advisoryLockWaitBackstop = 500 * time.Millisecond
-			DeferCleanup(func() { advisoryLockWaitBackstop = origBackstop })
-
-			holding := make(chan struct{})
-			release := make(chan struct{})
-			go func() {
-				defer GinkgoRecover()
-				_ = WithLockCtx(context.Background(), db, lockKey, func() error {
-					close(holding)
-					<-release // hold until the test releases us
-					return nil
-				})
-			}()
-			defer close(release)
-
-			<-holding
-
-			start := time.Now()
-			err := WithLockCtx(context.Background(), db, lockKey, func() error {
-				Fail("waiter should not have acquired the still-held lock")
-				return nil
-			})
-			Expect(err).To(HaveOccurred(), "deadline-less waiter should give up at the backstop, not hang")
-			Expect(time.Since(start)).To(BeNumerically("<", 5*time.Second),
-				"backstop must cap the wait well under the test timeout")
-		})
-
 		It("serializes concurrent WithLockCtx on same key", func() {
 			const lockKey int64 = 702

--- a/core/services/galleryop/backend_force_test.go
+++ b/core/services/galleryop/backend_force_test.go
@@ -1,114 +0,0 @@
-package galleryop_test
-
-import (
-	"context"
-	"os"
-	"path/filepath"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/gallery"
-	"github.com/mudler/LocalAI/core/services/galleryop"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"gopkg.in/yaml.v3"
-)
-
-// The install op must be idempotent unless Force is set: API clients call
-// POST /backends/apply on every boot to make sure the backend exists, and an
-// unconditional force here re-downloads the whole backend artifact each time.
-// Reinstall is an explicit, opted-in action.
-var _ = Describe("LocalBackendManager force semantics", func() {
-	var (
-		backendsDir string
-		srcDir      string
-		mgr         *galleryop.LocalBackendManager
-		systemState *system.SystemState
-		ml          *model.ModelLoader
-	)
-
-	const installedRunSh = "#!/bin/sh\necho installed\n"
-	const galleryRunSh = "#!/bin/sh\necho from-gallery\n"
-
-	installedRunShPath := func() string {
-		return filepath.Join(backendsDir, "test-backend", "run.sh")
-	}
-
-	BeforeEach(func() {
-		var err error
-		backendsDir, err = os.MkdirTemp("", "force-backends-*")
-		Expect(err).NotTo(HaveOccurred())
-		srcDir, err = os.MkdirTemp("", "force-src-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		// The gallery serves test-backend from a plain directory (offline).
-		// The gallery yaml itself must live under the backends path: file://
-		// galleries outside the trusted root are rejected by the downloader.
-		Expect(os.WriteFile(filepath.Join(srcDir, "run.sh"), []byte(galleryRunSh), 0o755)).To(Succeed())
-		entries := []map[string]any{{"name": "test-backend", "uri": srcDir}}
-		data, err := yaml.Marshal(entries)
-		Expect(err).NotTo(HaveOccurred())
-		galleryYAML := filepath.Join(backendsDir, "gallery.yaml")
-		Expect(os.WriteFile(galleryYAML, data, 0o644)).To(Succeed())
-
-		// test-backend is already installed, with content that differs from
-		// the gallery's so a reinstall is observable.
-		Expect(os.MkdirAll(filepath.Join(backendsDir, "test-backend"), 0o755)).To(Succeed())
-		Expect(os.WriteFile(installedRunShPath(), []byte(installedRunSh), 0o755)).To(Succeed())
-
-		systemState, err = system.GetSystemState(system.WithBackendPath(backendsDir))
-		Expect(err).NotTo(HaveOccurred())
-		appConfig := &config.ApplicationConfig{
-			SystemState:      systemState,
-			BackendGalleries: []config.Gallery{{Name: "test", URL: "file://" + galleryYAML}},
-		}
-		ml = model.NewModelLoader(systemState)
-		mgr = galleryop.NewLocalBackendManager(appConfig, ml)
-	})
-
-	AfterEach(func() {
-		Expect(os.RemoveAll(backendsDir)).To(Succeed())
-		Expect(os.RemoveAll(srcDir)).To(Succeed())
-	})
-
-	It("skips an already-installed backend when Force is not set", func() {
-		op := &galleryop.ManagementOp[gallery.GalleryBackend, any]{
-			ID:                 "op-1",
-			GalleryElementName: "test-backend",
-		}
-		Expect(mgr.InstallBackend(context.Background(), op, nil)).To(Succeed())
-
-		content, err := os.ReadFile(installedRunShPath())
-		Expect(err).NotTo(HaveOccurred())
-		Expect(string(content)).To(Equal(installedRunSh), "install without Force must not overwrite an installed backend")
-	})
-
-	It("reinstalls an already-installed backend when Force is set", func() {
-		op := &galleryop.ManagementOp[gallery.GalleryBackend, any]{
-			ID:                 "op-2",
-			GalleryElementName: "test-backend",
-			Force:              true,
-		}
-		Expect(mgr.InstallBackend(context.Background(), op, nil)).To(Succeed())
-
-		content, err := os.ReadFile(installedRunShPath())
-		Expect(err).NotTo(HaveOccurred())
-		Expect(string(content)).To(Equal(galleryRunSh), "install with Force must overwrite the installed backend")
-	})
-
-	// The LOCALAI_EXTERNAL_BACKENDS boot loop goes through
-	// InstallExternalBackend's gallery-name path on EVERY startup; it must not
-	// force, or each boot re-downloads every listed backend.
-	It("skips an already-installed backend on the non-forced external gallery-name path", func() {
-		err := galleryop.InstallExternalBackend(context.Background(),
-			[]config.Gallery{{Name: "test", URL: "file://" + filepath.Join(backendsDir, "gallery.yaml")}},
-			systemState, ml, nil, "test-backend", "", "", false, false)
-		Expect(err).NotTo(HaveOccurred())
-
-		content, err := os.ReadFile(installedRunShPath())
-		Expect(err).NotTo(HaveOccurred())
-		Expect(string(content)).To(Equal(installedRunSh), "non-forced external install must not overwrite an installed backend")
-	})
-})
--- a/core/services/galleryop/backends.go
+++ b/core/services/galleryop/backends.go
@@ -144,12 +144,7 @@ func (g *GalleryService) backendHandler(op *ManagementOp[gallery.GalleryBackend,
 // InstallExternalBackend installs a backend from an external source (OCI image, URL, or path).
 // This method contains the logic to detect the input type and call the appropriate installation function.
 // It can be used by both CLI and Web UI for installing backends from external sources.
-//
-// force applies only to the gallery-name fallback: a URI install (dir/OCI/file)
-// always writes, but a bare gallery name is an "ensure installed" — the
-// LOCALAI_EXTERNAL_BACKENDS boot loop runs it on every start and must not
-// re-download an installed, runnable backend.
-func InstallExternalBackend(ctx context.Context, galleries []config.Gallery, systemState *system.SystemState, modelLoader *model.ModelLoader, downloadStatus func(string, string, string, float64), backend, name, alias string, force, requireIntegrity bool) error {
+func InstallExternalBackend(ctx context.Context, galleries []config.Gallery, systemState *system.SystemState, modelLoader *model.ModelLoader, downloadStatus func(string, string, string, float64), backend, name, alias string, requireIntegrity bool) error {
 	uri := downloader.URI(backend)
 	switch {
 	case uri.LooksLikeDir():
@@ -207,7 +202,7 @@ func InstallExternalBackend(ctx context.Context, galleries []config.Gallery, sys
 		if name != "" || alias != "" {
 			return fmt.Errorf("specifying a name or alias is not supported for gallery backends")
 		}
-		err := gallery.InstallBackendFromGallery(ctx, galleries, systemState, modelLoader, backend, downloadStatus, force, requireIntegrity)
+		err := gallery.InstallBackendFromGallery(ctx, galleries, systemState, modelLoader, backend, downloadStatus, true, requireIntegrity)
 		if err != nil {
 			return fmt.Errorf("error installing backend %s: %w", backend, err)
 		}
--- a/core/services/galleryop/backends_test.go
+++ b/core/services/galleryop/backends_test.go
@@ -70,7 +70,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				"test-backend", // gallery name
 				"custom-name",  // name should not be allowed
 				"",
-				false, // force
 				false,
 			)
 			Expect(err).To(HaveOccurred())
@@ -87,7 +86,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				"non-existent-backend",
 				"",
 				"",
-				false, // force
 				false,
 			)
 			Expect(err).To(HaveOccurred())
@@ -105,7 +103,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				"oci://quay.io/mudler/tests:localai-backend-test",
 				"", // name is required for OCI images
 				"",
-				false, // force
 				false,
 			)
 			Expect(err).To(HaveOccurred())
@@ -139,7 +136,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				testBackendPath,
 				"", // name should be inferred as "source-backend"
 				"",
-				false, // force
 				false,
 			)
 			// The function should at least attempt to install with the inferred name
@@ -159,7 +155,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				testBackendPath,
 				"custom-backend-name",
 				"",
-				false, // force
 				false,
 			)
 			// The function should use the provided name
@@ -178,7 +173,6 @@ var _ = Describe("InstallExternalBackend", func() {
 				testBackendPath,
 				"custom-backend-name",
 				"custom-alias",
-				false, // force
 				false,
 			)
 			// The function should accept alias for directory paths
--- a/core/services/galleryop/managers_local.go
+++ b/core/services/galleryop/managers_local.go
@@ -110,13 +110,10 @@ func (b *LocalBackendManager) CheckUpgrades(ctx context.Context) (map[string]gal
 func (b *LocalBackendManager) InstallBackend(ctx context.Context, op *ManagementOp[gallery.GalleryBackend, any], progressCb ProgressCallback) error {
 	if op.ExternalURI != "" {
 		return InstallExternalBackend(ctx, b.backendGalleries, b.systemState, b.modelLoader,
-			progressCb, op.ExternalURI, op.ExternalName, op.ExternalAlias, op.Force, b.requireBackendIntegrity)
+			progressCb, op.ExternalURI, op.ExternalName, op.ExternalAlias, b.requireBackendIntegrity)
 	}
-	// op.Force distinguishes an explicit reinstall from an idempotent
-	// "make sure it's installed" op; the latter must not re-download an
-	// already-runnable backend (supervisors apply on every boot).
 	return gallery.InstallBackendFromGallery(ctx, b.backendGalleries, b.systemState,
-		b.modelLoader, op.GalleryElementName, progressCb, op.Force, b.requireBackendIntegrity)
+		b.modelLoader, op.GalleryElementName, progressCb, true, b.requireBackendIntegrity)
 }

 func (b *LocalBackendManager) IsDistributed() bool { return false }
--- a/core/services/galleryop/operation.go
+++ b/core/services/galleryop/operation.go
@@ -45,13 +45,6 @@ type ManagementOp[T any, E any] struct {

 	// Upgrade is true if this is an upgrade operation (not a fresh install)
 	Upgrade bool
-
-	// Force reinstalls a backend even when it is already installed and
-	// runnable. Without it a backend install op is idempotent — API clients
-	// that ensure a backend exists on every boot must not trigger a full
-	// artifact re-download each time. The UI's explicit "Reinstall backend"
-	// action sets it.
-	Force bool
 }

 type OpStatus struct {
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -68,13 +68,6 @@ type SmartRouterOptions struct {
 	// the absolute model paths untouched so the worker loads them directly from
 	// the shared volume (#10556). See config.DistributedConfig.SharedModels.
 	SharedModels bool
-	// ModelLoadCeiling is the hard upper bound on how long a single cold-load
-	// attempt (node selection -> backend install -> file staging -> LoadModel)
-	// may run while holding the per-model advisory lock. It backstops every
-	// sub-step's own timeout so a wedged worker can never pin the lock - and
-	// every other replica's request for that model - indefinitely. Zero selects
-	// defaultModelLoadCeiling.
-	ModelLoadCeiling time.Duration
 }

 // SmartRouter routes inference requests to the best available backend node.
@@ -108,18 +101,8 @@ type SmartRouter struct {
 	// sharedModels skips file staging when all nodes mount the same models
 	// directory at the same path (see SmartRouterOptions.SharedModels).
 	sharedModels bool
-	// modelLoadCeiling bounds how long a cold load may hold the per-model
-	// advisory lock (see SmartRouterOptions.ModelLoadCeiling).
-	modelLoadCeiling time.Duration
 }

-// defaultModelLoadCeiling is the fallback hold ceiling for a cold model load.
-// It must comfortably exceed the slowest legitimate load - a multi-GB backend
-// install (DefaultBackendInstallTimeout, 15m) plus staging and the remote
-// LoadModel (5m) - so it never cuts a real load short; it only ever fires when
-// a step is genuinely wedged (e.g. a worker that died mid-install).
-const defaultModelLoadCeiling = 25 * time.Minute
-
 // probeCacheTTL is how long a successful gRPC HealthCheck on a backend is
 // trusted before the next request re-probes. Matches healthCheckTTL in
 // pkg/model/model.go so the single-process and distributed paths share a
@@ -134,10 +117,6 @@ func NewSmartRouter(registry ModelRouter, opts SmartRouterOptions) *SmartRouter
 	if factory == nil {
 		factory = &tokenClientFactory{token: opts.AuthToken}
 	}
-	ceiling := opts.ModelLoadCeiling
-	if ceiling <= 0 {
-		ceiling = defaultModelLoadCeiling
-	}
 	return &SmartRouter{
 		registry:         registry,
 		unloader:         opts.Unloader,
@@ -152,7 +131,6 @@ func NewSmartRouter(registry ModelRouter, opts SmartRouterOptions) *SmartRouter
 		prefixConfig:     opts.PrefixConfig,
 		pressure:         opts.Pressure,
 		sharedModels:     opts.SharedModels,
-		modelLoadCeiling: ceiling,
 	}
 }

@@ -405,19 +383,11 @@ func (r *SmartRouter) Route(ctx context.Context, modelID, modelName, backendType
 	// the request context. If staging were bound to it, the multi-GB upload
 	// aborts with "context canceled" mid-transfer and large models can never
 	// finish staging (the model-load outage). WithoutCancel keeps the request's
-	// values (prefix chain, etc.) but drops its cancellation/deadline.
-	//
-	// Detaching from the caller is necessary, but it must not be unbounded: the
-	// load runs while holding the per-model advisory lock, and a worker that
-	// dies mid-install (its backend.install never replies) would otherwise pin
-	// that lock (and every other replica's request for the same model) until
-	// the NATS install deadline alone expires. Re-impose a single hard ceiling
-	// over the whole sequence so the lock is always released in bounded time,
-	// even if a sub-step wedges. Each long step still has its own (tighter)
-	// bound; this only backstops them. The per-model advisory lock below
-	// de-dupes concurrent loaders across replicas.
-	loadCtx, cancelLoad := context.WithTimeout(context.WithoutCancel(ctx), r.modelLoadCeiling)
-	defer cancelLoad()
+	// values (prefix chain, etc.) but drops its cancellation/deadline. Each
+	// long step still has its own bound (the backend install's NATS deadline,
+	// the file stager's resume budget, LoadModel's 5m timeout), and the
+	// per-model advisory lock below de-dupes concurrent loaders across replicas.
+	loadCtx := context.WithoutCancel(ctx)
 	loadModel := func(ctx context.Context) (*RouteResult, error) {
 		// Re-check after acquiring lock — another request may have loaded it
 		node, nm, err := r.registry.FindAndLockNodeWithModel(ctx, trackingKey, candidateNodeIDs, pref)
@@ -946,14 +916,7 @@ func (r *SmartRouter) installBackendOnNode(ctx context.Context, node *BackendNod
 	}

 	key := fmt.Sprintf("%s|%s|%s|%d", node.ID, backendType, modelID, replicaIndex)
-	// DoChan rather than Do so this wait honors ctx cancellation. InstallBackend
-	// blocks for its full NATS deadline (15m by default) when a worker accepts
-	// the request but never replies (e.g. it died mid-install). Without ctx
-	// awareness the caller (holding the per-model advisory lock) would sit there
-	// the whole time; here a cancelled ctx (typically the model-load ceiling)
-	// frees the caller promptly. The shared install keeps running in the
-	// background and still coalesces other callers via singleflight.
-	resCh := r.installFlight.DoChan(key, func() (any, error) {
+	v, err, _ := r.installFlight.Do(key, func() (any, error) {
 		reply, err := r.unloader.InstallBackend(node.ID, backendType, modelID, r.galleriesJSON, "", "", "", replicaIndex, "", nil)
 		if err != nil {
 			return "", err
@@ -968,15 +931,10 @@ func (r *SmartRouter) installBackendOnNode(ctx context.Context, node *BackendNod
 		}
 		return addr, nil
 	})
-	select {
-	case <-ctx.Done():
-		return "", ctx.Err()
-	case res := <-resCh:
-		if res.Err != nil {
-			return "", res.Err
-		}
-		return res.Val.(string), nil
+	if err != nil {
+		return "", err
 	}
+	return v.(string), nil
 }

 func (r *SmartRouter) buildClientForAddr(node *BackendNode, addr string, parallel bool) grpc.Backend {
--- a/core/services/nodes/router_test.go
+++ b/core/services/nodes/router_test.go
@@ -493,44 +493,6 @@ var _ = Describe("SmartRouter", func() {
 				Expect(result.Node.ID).To(Equal("n3"))
 			})
 		})
-
-		Context("worker wedges mid-install (dead node holding the lock)", func() {
-			It("aborts the load at the ModelLoadCeiling instead of blocking forever", func() {
-				// Simulate the production incident: the chosen worker accepts the
-				// backend.install but never replies (it died), so InstallBackend
-				// would otherwise block for its full NATS deadline (15m by
-				// default) while pinning the per-model advisory lock. Route must
-				// give up at the ceiling so the lock is released promptly.
-				reg.findAndLockErr = errors.New("not found")
-				reg.findIdleNode = &BackendNode{ID: "n4", Name: "dead-node", Address: "10.0.0.4:50051"}
-
-				block := make(chan struct{})
-				defer close(block) // let the background install goroutine drain at test end
-				unloader.installHook = func() { <-block }
-
-				router := NewSmartRouter(reg, SmartRouterOptions{
-					Unloader:         unloader,
-					ClientFactory:    factory,
-					ModelLoadCeiling: 200 * time.Millisecond,
-				})
-
-				done := make(chan error, 1)
-				start := time.Now()
-				go func() {
-					defer GinkgoRecover()
-					_, err := router.Route(context.Background(), "wedged-model",
-						"models/wedged.gguf", "llama-cpp",
-						&pb.ModelOptions{Model: "models/wedged.gguf"}, false)
-					done <- err
-				}()
-
-				var routeErr error
-				Eventually(done, 5*time.Second).Should(Receive(&routeErr),
-					"Route must not block on a wedged install past the ceiling")
-				Expect(routeErr).To(HaveOccurred())
-				Expect(time.Since(start)).To(BeNumerically("<", 5*time.Second))
-			})
-		})
 	})

 	Describe("scheduleNewModel (mock-based, via Route)", func() {
--- a/core/services/worker/install.go
+++ b/core/services/worker/install.go
@@ -134,7 +134,7 @@ func (s *backendSupervisor) installBackend(req messaging.BackendInstallRequest,
 		if req.URI != "" {
 			xlog.Info("Installing backend from external URI", "backend", req.Backend, "uri", req.URI, "force", force)
 			if err := galleryop.InstallExternalBackend(
-				context.Background(), galleries, s.systemState, s.ml, downloadCb, req.URI, req.Name, req.Alias, force, s.cfg.RequireBackendIntegrity,
+				context.Background(), galleries, s.systemState, s.ml, downloadCb, req.URI, req.Name, req.Alias, s.cfg.RequireBackendIntegrity,
 			); err != nil {
 				return "", fmt.Errorf("installing backend from gallery: %w", err)
 			}
@@ -201,7 +201,7 @@ func (s *backendSupervisor) upgradeBackend(req messaging.BackendUpgradeRequest)
 	if req.URI != "" {
 		xlog.Info("Upgrading backend from external URI", "backend", req.Backend, "uri", req.URI)
 		if err := galleryop.InstallExternalBackend(
-			context.Background(), galleries, s.systemState, s.ml, downloadCb, req.URI, req.Name, req.Alias, true, s.cfg.RequireBackendIntegrity,
+			context.Background(), galleries, s.systemState, s.ml, downloadCb, req.URI, req.Name, req.Alias, s.cfg.RequireBackendIntegrity,
 		); err != nil {
 			return fmt.Errorf("upgrading backend from external URI: %w", err)
 		}
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.5.6"
+  "version": "v4.5.5"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1758,8 +1758,8 @@
      use_tokenizer_template: true
  files:
    - filename: llama-cpp/models/Qwopus3.5-9B-Coder-MTP-GGUF/Qwopus3.5-9B-Coder-MTP-Q4_K_M.gguf
+      sha256: f6fc5d193045796d9e1870cbc40f827fe55f53f70593c3f5c1968b82b9331991
      uri: https://huggingface.co/Jackrong/Qwopus3.5-9B-Coder-MTP-GGUF/resolve/main/Qwopus3.5-9B-Coder-MTP-Q4_K_M.gguf
-      sha256: 9ea3ecd122a5165b8b81655f29eaf09d71daf841503e4c4212bdfadb36ab3712
    - filename: llama-cpp/mmproj/Qwopus3.5-9B-Coder-MTP-GGUF/Qwopus3.5-9B-Coder-MTP-mmproj.gguf
      sha256: f48daca405a1c768a9514e392c3955dcc4a9d66a5cf64cf45e064092b5f20ee4
      uri: https://huggingface.co/Jackrong/Qwopus3.5-9B-Coder-MTP-GGUF/resolve/main/Qwopus3.5-9B-Coder-MTP-mmproj.gguf
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -3605,10 +3605,6 @@ const docTemplate = `{
        "localai.GalleryBackend": {
            "type": "object",
            "properties": {
-                "force": {
-                    "description": "Force reinstalls the backend even when it is already installed and\nrunnable. Off by default so apply stays idempotent for supervising\napps that ensure their backend on every boot.",
-                    "type": "boolean"
-                },
                "id": {
                    "type": "string"
                }
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -3602,10 +3602,6 @@
        "localai.GalleryBackend": {
            "type": "object",
            "properties": {
-                "force": {
-                    "description": "Force reinstalls the backend even when it is already installed and\nrunnable. Off by default so apply stays idempotent for supervising\napps that ensure their backend on every boot.",
-                    "type": "boolean"
-                },
                "id": {
                    "type": "string"
                }
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -303,12 +303,6 @@ definitions:
    type: object
  localai.GalleryBackend:
    properties:
-      force:
-        description: |-
-          Force reinstalls the backend even when it is already installed and
-          runnable. Off by default so apply stays idempotent for supervising
-          apps that ensure their backend on every boot.
-        type: boolean
      id:
        type: string
    type: object