From e05dece93c803545e346c1a4765596695d95a788 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Mon, 22 Jun 2026 13:32:02 +0000
Subject: [PATCH] feat(recon): honor LocalAI per-model threads in
 voice/face-detect backends

LocalAI spawns one backend process per model and serves requests
concurrently, so the engines' own min(hardware_concurrency, 8) default
can oversubscribe cores. Forward the per-model Threads value from the
gRPC LoadModel options into the engine via VOICEDETECT_THREADS /
FACEDETECT_THREADS (read at backend construction) before the capi load.
A non-positive Threads is treated as unset: the variable is left
untouched, so the engine default (or a value already present in the
environment) still applies.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
---
 backend/go/face-detect/gofacedetect.go   | 15 +++++++++++++++
 backend/go/voice-detect/govoicedetect.go | 16 ++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/backend/go/face-detect/gofacedetect.go b/backend/go/face-detect/gofacedetect.go
index 5577a2404..4ad6c067c 100644
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -8,6 +8,7 @@ import (
 	"math"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"time"
 	"unsafe"
@@ -66,6 +67,20 @@ func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
 		f.opts.modelName = filepath.Base(model)
 	}
 
+	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
+	// one backend process per model and serves requests concurrently, so the
+	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
+	// FACEDETECT_THREADS is read by the engine at backend construction, so it
+	// must be set before the capi load. A non-positive Threads means "unset":
+	// leave the env alone so the engine keeps its sane default.
+	threads := opts.Threads
+	if threads > 0 {
+		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
+			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
+		}
+		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
+	}
+
 	xlog.Info("face-detect: loading model", "model", model,
 		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
 
diff --git a/backend/go/voice-detect/govoicedetect.go b/backend/go/voice-detect/govoicedetect.go
index ea648e896..2bbe74bd0 100644
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"time"
 	"unsafe"
@@ -63,6 +65,20 @@ func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
 		v.opts.modelName = filepath.Base(model)
 	}
 
+	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
+	// one backend process per model and serves requests concurrently, so the
+	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
+	// VOICEDETECT_THREADS is read by the engine at backend construction, so it
+	// must be set before the capi load. A non-positive Threads means "unset":
+	// leave the env alone so the engine keeps its sane default.
+	threads := opts.Threads
+	if threads > 0 {
+		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
+			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
+		}
+		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
+	}
+
 	xlog.Info("voice-detect: loading model", "model", model,
 		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())