Files
LocalAI/pkg/model/process.go
Richard Palethorpe 4647770316 fix(model): track intentional stops, stop misreading clean shutdowns as crashes (#10060)
Two separate issues made graceful backend shutdown look ungraceful in the
logs, even though the processes were being terminated correctly
(go-processmanager defaults to process-group SIGTERM + 15s grace + SIGKILL):

1. "failed to read PID" — startProcess registers a per-process graceful-
   termination handler that calls Stop(), but StopAllGRPC (registered
   earlier, via app.Shutdown) already stopped and released store-tracked
   backends first. The second Stop() then failed reading the removed
   pidfile. Guard the handler with IsAlive() so it skips already-stopped
   processes; it still covers backends StopAllGRPC doesn't track (worker-
   supervised ones).

2. "Backend process exited unexpectedly" exitCode=-1 — the exit watcher
   treated only exit codes 0/143 as clean. But a child killed by our own
   SIGTERM/SIGKILL is reported by Go as exitCode -1 (signal termination),
   not the shell's 128+signal convention, so every intentional stop logged
   a false crash warning. The exit code can't distinguish an intended stop
   from a signal-induced crash.

Track intent directly instead: a stoppingProcs sync.Map (keyed by the
*process.Process pointer) is marked wherever LocalAI calls Stop() on
purpose, and the exit watcher uses it to pick the log level — Info
"stopped" when intentional, Warn "exited unexpectedly" otherwise (still
catching real crashes). The raw exit code is reported as a field but no
longer interpreted.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-29 18:54:27 +02:00

252 lines
7.8 KiB
Go

package model
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/hpcloud/tail"
"github.com/mudler/LocalAI/pkg/signals"
process "github.com/mudler/go-processmanager"
"github.com/mudler/xlog"
)
var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
var (
modelNotFoundErr = errors.New("model not found")
)
func (ml *ModelLoader) deleteProcess(s string) error {
model, ok := ml.store.Get(s)
if !ok {
xlog.Debug("Model not found", "model", s)
return modelNotFoundErr
}
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
xlog.Debug("Model busy. Waiting.", "model", s)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && forceBackendShutdown {
xlog.Warn("Model is still busy after retries. Forcing shutdown.", "model", s, "retries", retries)
break
}
}
xlog.Debug("Deleting process", "model", s)
// Run unload hooks (e.g. close MCP sessions)
for _, hook := range ml.onUnloadHooks {
hook(s)
}
// Free GPU resources before stopping the process to ensure VRAM is released
xlog.Debug("Calling Free() to release GPU resources", "model", s)
if err := model.GRPC(false, ml.wd).Free(context.Background()); err != nil {
xlog.Warn("Error freeing GPU resources", "error", err, "model", s)
}
process := model.Process()
if process == nil {
// No local process — this is a remote/external backend.
// In distributed mode, delegate to the remote unloader to tell
// the backend node to free the model (GPU resources, etc.).
if ml.remoteUnloader != nil {
xlog.Debug("Delegating model unload to remote unloader", "model", s)
if err := ml.remoteUnloader.UnloadRemoteModel(s); err != nil {
xlog.Warn("Remote model unload failed", "model", s, "error", err)
}
} else {
xlog.Debug("No local process and no remote unloader", "model", s)
}
ml.store.Delete(s)
return nil
}
// Mark the stop as intentional so the exit-watcher logs it as an
// expected stop, not a crash (signal-terminated children report -1).
ml.stoppingProcs.Store(process, struct{}{})
err := process.Stop()
if err != nil {
xlog.Error("(deleteProcess) error while deleting process", "error", err, "model", s)
}
if err == nil {
ml.store.Delete(s)
}
return err
}
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
var err error = nil
ml.mu.Lock()
defer ml.mu.Unlock()
// Collect matching keys first — can't mutate store during Range
var toDelete []string
ml.store.Range(func(k string, m *Model) bool {
if filter(k, m.Process()) {
toDelete = append(toDelete, k)
}
return true
})
for _, k := range toDelete {
e := ml.deleteProcess(k)
err = errors.Join(err, e)
}
return err
}
func (ml *ModelLoader) StopAllGRPC() error {
return ml.StopGRPC(all)
}
func (ml *ModelLoader) GetGRPCPID(id string) (int, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
p, exists := ml.store.Get(id)
if !exists {
return -1, fmt.Errorf("no grpc backend found for %s", id)
}
if p.Process() == nil {
return -1, fmt.Errorf("no grpc backend found for %s", id)
}
return strconv.Atoi(p.Process().PID)
}
// StartProcess starts a gRPC backend process and returns its process handle.
// This is the public wrapper for the internal startProcess method, used by
// the serve-backend CLI subcommand to start a backend on a specified address.
func (ml *ModelLoader) StartProcess(grpcProcess, id string, serverAddress string, args ...string) (*process.Process, error) {
return ml.startProcess(grpcProcess, id, serverAddress, args...)
}
func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string, args ...string) (*process.Process, error) {
// Make sure the process is executable
// Check first if it has executable permissions
if fi, err := os.Stat(grpcProcess); err == nil {
if fi.Mode()&0111 == 0 {
xlog.Debug("Process is not executable. Making it executable.", "process", grpcProcess)
if err := os.Chmod(grpcProcess, 0700); err != nil {
return nil, err
}
}
}
xlog.Debug("Loading GRPC Process", "process", grpcProcess)
xlog.Debug("GRPC Service will be running", "id", id, "address", serverAddress)
workDir, err := filepath.Abs(filepath.Dir(grpcProcess))
if err != nil {
return nil, err
}
grpcControlProcess := process.New(
process.WithTemporaryStateDir(),
process.WithName(filepath.Base(grpcProcess)),
process.WithArgs(append(args, []string{"--addr", serverAddress}...)...),
process.WithEnvironment(os.Environ()...),
process.WithWorkDir(workDir),
)
if ml.wd != nil {
ml.wd.Add(serverAddress, grpcControlProcess)
ml.wd.AddAddressModelMap(serverAddress, id)
}
if err := grpcControlProcess.Run(); err != nil {
return grpcControlProcess, err
}
xlog.Debug("GRPC Service state dir", "dir", grpcControlProcess.StateDir())
signals.RegisterGracefulTerminationHandler(func() {
// StopAllGRPC (the deleteProcess path) is registered earlier and runs
// first for store-tracked backends, stopping this process and removing
// its pidfile. Calling Stop again then fails with "failed to read PID".
// Skip when it's already gone; this handler still covers processes that
// StopAllGRPC doesn't track (e.g. worker-supervised backends).
if !grpcControlProcess.IsAlive() {
return
}
ml.stoppingProcs.Store(grpcControlProcess, struct{}{})
if err := grpcControlProcess.Stop(); err != nil {
xlog.Error("error while shutting down grpc process", "error", err)
}
})
go func() {
t, err := tail.TailFile(grpcControlProcess.StderrPath(), tail.Config{Follow: true})
if err != nil {
xlog.Error("Could not tail stderr", "process", grpcProcess)
return
}
for line := range t.Lines {
xlog.Debug("GRPC stderr", "id", strings.Join([]string{id, serverAddress}, "-"), "line", line.Text)
if ml.backendLogs != nil && ml.backendLoggingEnabled.Load() {
ml.backendLogs.AppendLine(id, "stderr", line.Text)
}
}
}()
go func() {
t, err := tail.TailFile(grpcControlProcess.StdoutPath(), tail.Config{Follow: true})
if err != nil {
xlog.Error("Could not tail stdout", "process", grpcProcess)
return
}
for line := range t.Lines {
xlog.Debug("GRPC stdout", "id", strings.Join([]string{id, serverAddress}, "-"), "line", line.Text)
if ml.backendLogs != nil && ml.backendLoggingEnabled.Load() {
ml.backendLogs.AppendLine(id, "stdout", line.Text)
}
}
}()
// Surface backend exits in the log. Without this, a crash (SIGSEGV
// from a missing shared library, a Python ImportError, etc.) is
// invisible at every log level — the only signal is a delayed
// "connection refused" from the gRPC dial, which doesn't say
// whether the child is alive.
go func() {
<-grpcControlProcess.Done()
// LoadAndDelete both reads the intentional-stop marker and frees the
// map entry so it doesn't accumulate across the process's lifetime.
_, intentional := ml.stoppingProcs.LoadAndDelete(grpcControlProcess)
fields := []any{
"id", id,
"address", serverAddress,
"process", filepath.Base(grpcProcess),
}
// Report the raw exit code without interpreting it: a child killed by
// our own SIGTERM/SIGKILL surfaces as -1 (Go reports -1 for signal
// termination, not the shell's 128+signal convention), so the code
// alone can't tell an intended stop from a crash. The stoppingProcs
// marker is the reliable signal for that, so it picks the log level.
if code, codeErr := grpcControlProcess.ExitCode(); codeErr == nil {
fields = append(fields, "exitCode", code)
}
if intentional {
xlog.Info("Backend process stopped", fields...)
} else {
// A stop we didn't initiate — a SIGSEGV from a missing shared
// library, a Python ImportError, an OOM kill, an unexpected self-exit.
xlog.Warn("Backend process exited unexpectedly", fields...)
}
}()
return grpcControlProcess, nil
}