diff --git a/pkg/grpc/parentwatch.go b/pkg/grpc/parentwatch.go
new file mode 100644
index 000000000..f6a8e2009
--- /dev/null
+++ b/pkg/grpc/parentwatch.go
@@ -0,0 +1,126 @@
+package grpc
+
+import (
+	"log"
+	"os"
+	"runtime"
+	"strings"
+	"time"
+)
+
+// Backend worker processes (the per-model gRPC servers LocalAI spawns) are
+// deliberately placed in their own process group by the process manager so
+// LocalAI's graceful shutdown can signal the whole group. That graceful path
+// (SIGTERM -> grace -> SIGKILL, driven by pkg/signals + pkg/model) only runs
+// when LocalAI itself receives a catchable signal and lives long enough to run
+// its handlers. If LocalAI is SIGKILLed (e.g. a supervising process's
+// graceful-shutdown grace period elapses first), that teardown never runs and
+// this backend would be reparented to init and linger, holding VRAM and its
+// listen port.
+//
+// The watcher below is a best-effort backstop for exactly that case: it does
+// NOT replace the graceful teardown, it only covers the "parent vanished
+// without cleaning up" path. It works by detecting reparenting: when the
+// process that spawned this backend dies, the kernel reparents us to the
+// nearest sub-reaper or to init (PID 1), so getppid() stops matching the value
+// we captured at startup. This getppid() approach is portable across
+// Linux/macOS (unlike Linux-only PR_SET_PDEATHSIG), which is why it's used
+// here rather than a kernel parent-death signal.
+const (
+	// EnvBackendParentWatch toggles the parent-death watcher. It is enabled by
+	// default; set it to a falsey value ("false", "0", "no", "off") to disable
+	// (e.g. when running a backend standalone for debugging under a shell whose
+	// lifetime shouldn't govern the backend).
+	EnvBackendParentWatch = "LOCALAI_BACKEND_PARENT_WATCH"
+	// EnvBackendParentWatchInterval overrides the poll interval as a Go
+	// duration string (e.g. "500ms"). Surrounding whitespace is ignored. Empty,
+	// unparsable or non-positive values fall back to defaultParentWatchInterval.
+	EnvBackendParentWatchInterval = "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"
+
+	// defaultParentWatchInterval is the poll period used when no valid override
+	// is configured.
+	defaultParentWatchInterval = 2 * time.Second
+)
+
+// parentWatchEnabled reports whether the watcher should run in this process:
+// it is on unless EnvBackendParentWatch holds a falsey value (compared
+// case-insensitively, ignoring surrounding whitespace) or the OS is Windows.
+func parentWatchEnabled() bool {
+	switch strings.ToLower(strings.TrimSpace(os.Getenv(EnvBackendParentWatch))) {
+	case "false", "0", "no", "off":
+		return false
+	}
+	// Windows does not reparent orphans to a well-known init PID, so the
+	// getppid() heuristic used here doesn't apply there.
+	return runtime.GOOS != "windows"
+}
+
+// parentWatchInterval returns the configured poll interval, or the default.
+// Surrounding whitespace is ignored, matching how EnvBackendParentWatch is
+// parsed. A value that is not a positive Go duration is logged and replaced
+// by the default, so a typo cannot silently change the polling cadence.
+func parentWatchInterval() time.Duration {
+	raw := strings.TrimSpace(os.Getenv(EnvBackendParentWatchInterval))
+	if raw == "" {
+		return defaultParentWatchInterval
+	}
+	d, err := time.ParseDuration(raw)
+	if err != nil || d <= 0 {
+		log.Printf("ignoring invalid %s=%q; using default poll interval %s", EnvBackendParentWatchInterval, raw, defaultParentWatchInterval)
+		return defaultParentWatchInterval
+	}
+	return d
+}
+
+// parentDied reports whether this process has been reparented away from the
+// parent it had when the watcher started. Reparenting is the standard POSIX
+// signal that the original parent (here, the LocalAI process that spawned this
+// backend) has exited: the orphan is handed to the nearest sub-reaper or to
+// init (PID 1), so getppid() no longer matches the value captured at startup.
+func parentDied(origPPID int) bool {
+	ppid := os.Getppid()
+	return ppid != origPPID || ppid == 1
+}
+
+// watchParentDeath polls every interval until parentDied reports the original
+// parent is gone, then invokes onDeath and returns. It blocks, so run it in
+// its own goroutine. A non-positive interval is replaced by
+// defaultParentWatchInterval because time.NewTicker panics on one.
+func watchParentDeath(origPPID int, interval time.Duration, onDeath func()) {
+	if interval <= 0 {
+		interval = defaultParentWatchInterval
+	}
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for range ticker.C {
+		if parentDied(origPPID) {
+			onDeath()
+			return
+		}
+	}
+}
+
+// startParentDeathWatcher installs the best-effort safety net described above
+// on the calling backend process. It is a no-op when disabled or on platforms
+// where the mechanism doesn't apply. This is a backstop alongside — never a
+// replacement for — LocalAI's graceful SIGTERM->grace->SIGKILL teardown.
+//
+// The parent PID is sampled when this function runs, so a parent that died
+// before the call goes unnoticed: the watcher then tracks whichever process
+// adopted us, or is not started at all when that process is PID 1.
+func startParentDeathWatcher() {
+	if !parentWatchEnabled() {
+		return
+	}
+	origPPID := os.Getppid()
+	// A parent of 1 at startup means we were already orphaned (or launched
+	// directly under init) — there's no original parent to watch for.
+	if origPPID <= 1 {
+		return
+	}
+	interval := parentWatchInterval()
+	go watchParentDeath(origPPID, interval, func() {
+		log.Printf("backend parent process (pid %d) exited without stopping this backend; self-terminating to avoid orphaning", origPPID)
+		os.Exit(1)
+	})
+}
diff --git a/pkg/grpc/parentwatch_test.go b/pkg/grpc/parentwatch_test.go
new file mode 100644
index 000000000..b3af0c79d
--- /dev/null
+++ b/pkg/grpc/parentwatch_test.go
@@ -0,0 +1,163 @@
+//go:build !windows
+
+package grpc
+
+import (
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"syscall"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// These env vars drive the helper roles this test binary re-executes itself as
+// (see the init() dispatcher). They are only set for the spawned child/
+// grandchild processes, never for the normal `go test` invocation.
+const (
+	envRole   = "LOCALAI_PARENTWATCH_TEST_ROLE"
+	envReady  = "LOCALAI_PARENTWATCH_TEST_READY"  // grandchild writes its PID here once the watcher is armed
+	envExited = "LOCALAI_PARENTWATCH_TEST_EXITED" // grandchild writes here when it detects reparenting
+)
+
+// init dispatches the helper roles when this test binary is re-executed with a
+// role set. It runs before the testing/Ginkgo machinery, and is a no-op during
+// a normal test run (role unset).
+func init() {
+	switch role := os.Getenv(envRole); role {
+	case "grandchild":
+		runGrandchildRole()
+	case "middle":
+		runMiddleRole()
+	}
+}
+
+// childEnv copies the current environment, dropping any inherited parentwatch
+// test role and appending the requested one; all other variables pass through.
+func childEnv(role string) []string {
+	prefix := envRole + "="
+	inherited := os.Environ()
+	env := make([]string, 0, len(inherited)+1)
+	for _, kv := range inherited {
+		if len(kv) < len(prefix) || kv[:len(prefix)] != prefix {
+			env = append(env, kv)
+		}
+	}
+	return append(env, prefix+role)
+}
+
+// runGrandchildRole runs the production watchParentDeath against the process
+// that spawned it ("middle"), publishes its PID as a readiness marker, and then
+// parks. Once middle is gone the watcher writes the exited marker and exits 7.
+func runGrandchildRole() {
+	readyPath, exitedPath := os.Getenv(envReady), os.Getenv(envExited)
+
+	// Safety valve: never linger if something goes wrong with the test.
+	go func() {
+		time.Sleep(30 * time.Second)
+		os.Exit(2)
+	}()
+
+	recordAndExit := func() {
+		_ = os.WriteFile(exitedPath, []byte("1"), 0o644)
+		os.Exit(7)
+	}
+	parentPID := os.Getppid()
+	go watchParentDeath(parentPID, 50*time.Millisecond, recordAndExit)
+
+	// Publish readiness only after the parent PID was captured, so middle
+	// cannot exit before it has been recorded as our original parent.
+	_ = os.WriteFile(readyPath, []byte(strconv.Itoa(os.Getpid())), 0o644)
+
+	select {} // block until the watcher terminates us
+}
+
+// runMiddleRole spawns the grandchild (which arms the watcher against us),
+// waits until it is ready, then exits — orphaning the grandchild so it gets
+// reparented, which is what the watcher must detect.
+func runMiddleRole() {
+	readyFile := os.Getenv(envReady)
+
+	self, err := os.Executable()
+	if err != nil {
+		os.Exit(3)
+	}
+	cmd := exec.Command(self)
+	cmd.Env = childEnv("grandchild")
+	// Own process group, mirroring how real backends are spawned. The std streams
+	// stay nil, so os/exec wires them to the null device and no pipe is shared.
+	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	if err := cmd.Start(); err != nil {
+		os.Exit(4)
+	}
+
+	if !waitForFile(readyFile, 10*time.Second) {
+		os.Exit(5)
+	}
+	os.Exit(0) // orphan the grandchild
+}
+
+// waitForFile polls for path every 20ms; it reports false once timeout elapses.
+func waitForFile(path string, timeout time.Duration) bool {
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		if _, err := os.Stat(path); err == nil {
+			return true
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	return false
+}
+
+// This spec builds a genuine two-level process tree (test -> middle ->
+// grandchild), lets the middle process die, and asserts the grandchild's
+// watchParentDeath detects the reparenting and self-terminates.
+var _ = Describe("watchParentDeath", func() {
+	It("detects reparenting and self-terminates the orphaned process", func() {
+		if runtime.GOOS == "windows" {
+			Skip("parent-death watcher is not supported on windows")
+		}
+
+		dir := GinkgoT().TempDir()
+		readyFile := filepath.Join(dir, "ready")
+		exitedFile := filepath.Join(dir, "exited")
+
+		self, err := os.Executable()
+		Expect(err).NotTo(HaveOccurred(), "cannot resolve test executable")
+
+		middle := exec.Command(self)
+		middle.Env = append(childEnv("middle"),
+			envReady+"="+readyFile,
+			envExited+"="+exitedFile,
+		)
+		// Stdout/Stderr stay nil, so os/exec discards the helpers' output.
+
+		// Best-effort cleanup in case the watcher doesn't fire. It is registered before
+		// the assertions below so an early failure still kills a stray grandchild.
+		DeferCleanup(func() {
+			if b, err := os.ReadFile(readyFile); err == nil {
+				if pid, err := strconv.Atoi(string(b)); err == nil {
+					_ = syscall.Kill(pid, syscall.SIGKILL)
+				}
+			}
+		})
+
+		Expect(middle.Start()).To(Succeed(), "failed to start middle helper")
+		// Wait only for the middle process; the grandchild is intentionally left
+		// orphaned. No pipes are shared, so this returns as soon as middle exits.
+		Expect(middle.Wait()).To(Succeed(), "middle helper exited with error")
+
+		// The grandchild must have armed the watcher (and thus captured middle as
+		// its parent) before middle exited.
+		_, err = os.Stat(readyFile)
+		Expect(err).NotTo(HaveOccurred(), "grandchild never signaled readiness")
+
+		// Now that middle is gone, the grandchild has been reparented; the watcher
+		// must notice and write the exited marker.
+		Expect(waitForFile(exitedFile, 10*time.Second)).To(BeTrue(), "watcher did not detect parent death within timeout")
+	})
+})
diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go
index c4c2785c8..0ed50360f 100644
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -939,6 +939,9 @@ func StartServer(address string, model AIModel) error {
 	s := grpc.NewServer(serverOpts()...)
 	pb.RegisterBackendServer(s, &server{llm: model})
 	log.Printf("gRPC Server listening at %v", lis.Addr())
+	// Safety net: self-terminate if the LocalAI process that spawned this
+	// backend dies without running its graceful teardown (see parentwatch.go).
+	startParentDeathWatcher()
 	if err := s.Serve(lis); err != nil {
 		return err
 	}
@@ -954,6 +957,9 @@ func RunServer(address string, model AIModel) (func() error, error) {
 	s := grpc.NewServer(serverOpts()...)
 	pb.RegisterBackendServer(s, &server{llm: model})
 	log.Printf("gRPC Server listening at %v", lis.Addr())
+	// Safety net: self-terminate if the LocalAI process that spawned this
+	// backend dies without running its graceful teardown (see parentwatch.go).
+	startParentDeathWatcher()
 	if err = s.Serve(lis); err != nil {
 		return func() error {
 			return lis.Close()