mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 03:25:42 -04:00
* feat(distributed): add per-request node ID context holder Introduce pkg/distributedhdr, a leaf package carrying a per-request *atomic.Value holder for the picked worker node ID from the SmartRouter (core/services/nodes) up to the HTTP response writer wrapper (core/http/middleware). Avoids the import cycle that a shared key in either consumer would create. Exposes NewHolder, WithHolder, Holder, Stamp, Load, Inherit. The holder is atomic.Value so cross-goroutine publish from the router to the response writer wrapper is race-clean. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): add ExposeNodeHeader middleware + response writer wrapper New ApplicationConfig.ExposeNodeHeader bool + --expose-node-header CLI flag / LOCALAI_EXPOSE_NODE_HEADER env var (default off; the node ID reveals internal topology and is opt-in). The middleware creates a per-request *atomic.Value holder, attaches it to c.Request().Context() via distributedhdr.WithHolder, and wraps c.Response().Writer with a custom http.ResponseWriter that sets the X-LocalAI-Node header on first Write / WriteHeader / Flush by reading the holder. Implements http.Flusher, http.Hijacker, Unwrap so it composes cleanly with Echo and http.NewResponseController. request.go propagates the holder onto derived contexts via distributedhdr.Inherit so the holder survives the correlation-ID context replacement. Unit + race-clean concurrency + integration specs. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): stamp node ID in router and wire middleware to inference routes ModelRouterAdapter.Route stamps the picked node ID into the per-request holder via distributedhdr.Stamp(ctx, result.Node.ID) right after replica selection. Wire ExposeNodeHeader middleware to: - OpenAI chat/completion/embeddings + audio transcriptions/speech + image generations/inpainting - Anthropic /v1/messages - Ollama /api/chat, /api/generate, /api/embed, /api/embeddings - Jina /v1/rerank - LocalAI /v1/vad The middleware's wrapper reads the holder on first byte and sets the X-LocalAI-Node response header before delegating to the underlying writer. Per-request scope means no race under concurrent multi-replica routing. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(distributed): thread request context through backend Load + cover ctx propagation Five non-OpenAI backend helpers were silently using app.Context instead of the request context for the gRPC backend call: transcription, TTS, image generation, rerank, VAD. Effect: distributedhdr.Stamp in the router callback was a silent no-op for these paths, AND client cancellation didn't propagate to in-flight inference. Thread c.Request().Context() (or the equivalent input.Context after the request middleware has installed the correlation-ID derived context) through each helper and into ModelOptions via model.WithContext(ctx). ImageGeneration's signature gains a leading ctx parameter; in-tree callers (openai image, openai inpainting, openai inpainting_test) are updated to match. ModelEmbedding gains a leading ctx parameter for the same reason; the openai and ollama embedding handlers pass the request context through. chat_stream_workers.go defers the initial role=assistant chunk emission until the first token callback so the wrapper's lazy X-LocalAI-Node lookup against the loader runs AFTER ml.Load has stamped the per-modelID node ID; semantically identical for clients (role still arrives before any text). Regression test core/backend/ctx_propagation_test.go pins ctx propagation for all five helpers. Docs updated to enumerate the full endpoint coverage of the --expose-node-header flag. Assisted-by: Claude:claude-opus-4-7[1m] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
287 lines
7.9 KiB
Go
287 lines
7.9 KiB
Go
package backend
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"fmt"
|
|
"maps"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/core/trace"
|
|
laudio "github.com/mudler/LocalAI/pkg/audio"
|
|
|
|
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/LocalAI/pkg/utils"
|
|
)
|
|
|
|
func ModelTTS(
|
|
ctx context.Context,
|
|
text,
|
|
voice,
|
|
language string,
|
|
loader *model.ModelLoader,
|
|
appConfig *config.ApplicationConfig,
|
|
modelConfig config.ModelConfig,
|
|
) (string, *proto.Result, error) {
|
|
// model.WithContext(ctx) overrides the app-context default set in
|
|
// ModelOptions so distributed routing decisions reach the request's
|
|
// X-LocalAI-Node holder via distributedhdr.Stamp.
|
|
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
|
|
ttsModel, err := loader.Load(opts...)
|
|
if err != nil {
|
|
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
|
return "", nil, err
|
|
}
|
|
|
|
if ttsModel == nil {
|
|
return "", nil, fmt.Errorf("could not load tts model %q", modelConfig.Model)
|
|
}
|
|
|
|
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
|
|
if err := os.MkdirAll(audioDir, 0750); err != nil {
|
|
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
|
}
|
|
|
|
fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
|
|
filePath := filepath.Join(audioDir, fileName)
|
|
|
|
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
|
|
// This should be addressed in a follow up PR soon.
|
|
// Copying it over nearly verbatim, as TTS backends are not functional without this.
|
|
modelPath := ""
|
|
// Checking first that it exists and is not outside ModelPath
|
|
// TODO: we should actually first check if the modelFile is looking like
|
|
// a FS path
|
|
mp := filepath.Join(loader.ModelPath, modelConfig.Model)
|
|
if _, err := os.Stat(mp); err == nil {
|
|
if err := utils.VerifyPath(mp, appConfig.SystemState.Model.ModelsPath); err != nil {
|
|
return "", nil, err
|
|
}
|
|
modelPath = mp
|
|
} else {
|
|
modelPath = modelConfig.Model // skip this step if it fails?????
|
|
}
|
|
|
|
var startTime time.Time
|
|
if appConfig.EnableTracing {
|
|
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
startTime = time.Now()
|
|
}
|
|
|
|
res, err := ttsModel.TTS(ctx, &proto.TTSRequest{
|
|
Text: text,
|
|
Model: modelPath,
|
|
Voice: voice,
|
|
Dst: filePath,
|
|
Language: &language,
|
|
})
|
|
|
|
if appConfig.EnableTracing {
|
|
errStr := ""
|
|
if err != nil {
|
|
errStr = err.Error()
|
|
} else if !res.Success {
|
|
errStr = fmt.Sprintf("TTS error: %s", res.Message)
|
|
}
|
|
|
|
data := map[string]any{
|
|
"text": text,
|
|
"voice": voice,
|
|
"language": language,
|
|
}
|
|
if err == nil && res.Success {
|
|
if snippet := trace.AudioSnippet(filePath, appConfig.TracingMaxBodyBytes); snippet != nil {
|
|
maps.Copy(data, snippet)
|
|
}
|
|
}
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
Timestamp: startTime,
|
|
Duration: time.Since(startTime),
|
|
Type: trace.BackendTraceTTS,
|
|
ModelName: modelConfig.Name,
|
|
Backend: modelConfig.Backend,
|
|
Summary: trace.TruncateString(text, 200),
|
|
Error: errStr,
|
|
Data: data,
|
|
})
|
|
}
|
|
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
|
|
// return RPC error if any
|
|
if !res.Success {
|
|
return "", nil, fmt.Errorf("error during TTS: %s", res.Message)
|
|
}
|
|
|
|
return filePath, res, err
|
|
}
|
|
|
|
func ModelTTSStream(
|
|
ctx context.Context,
|
|
text,
|
|
voice,
|
|
language string,
|
|
loader *model.ModelLoader,
|
|
appConfig *config.ApplicationConfig,
|
|
modelConfig config.ModelConfig,
|
|
audioCallback func([]byte) error,
|
|
) error {
|
|
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
|
|
ttsModel, err := loader.Load(opts...)
|
|
if err != nil {
|
|
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
|
return err
|
|
}
|
|
|
|
if ttsModel == nil {
|
|
return fmt.Errorf("could not load tts model %q", modelConfig.Model)
|
|
}
|
|
|
|
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
|
|
// This should be addressed in a follow up PR soon.
|
|
// Copying it over nearly verbatim, as TTS backends are not functional without this.
|
|
modelPath := ""
|
|
// Checking first that it exists and is not outside ModelPath
|
|
// TODO: we should actually first check if the modelFile is looking like
|
|
// a FS path
|
|
mp := filepath.Join(loader.ModelPath, modelConfig.Model)
|
|
if _, err := os.Stat(mp); err == nil {
|
|
if err := utils.VerifyPath(mp, appConfig.SystemState.Model.ModelsPath); err != nil {
|
|
return err
|
|
}
|
|
modelPath = mp
|
|
} else {
|
|
modelPath = modelConfig.Model // skip this step if it fails?????
|
|
}
|
|
|
|
var startTime time.Time
|
|
if appConfig.EnableTracing {
|
|
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
|
|
startTime = time.Now()
|
|
}
|
|
|
|
var sampleRate uint32 = 16000 // default
|
|
headerSent := false
|
|
var callbackErr error
|
|
|
|
// Collect up to 30s of audio for tracing
|
|
var snippetPCM []byte
|
|
var totalPCMBytes int
|
|
snippetCapped := false
|
|
|
|
err = ttsModel.TTSStream(ctx, &proto.TTSRequest{
|
|
Text: text,
|
|
Model: modelPath,
|
|
Voice: voice,
|
|
Language: &language,
|
|
}, func(reply *proto.Reply) {
|
|
// First message contains sample rate info
|
|
if !headerSent && len(reply.Message) > 0 {
|
|
var info map[string]any
|
|
if json.Unmarshal(reply.Message, &info) == nil {
|
|
if sr, ok := info["sample_rate"].(float64); ok {
|
|
sampleRate = uint32(sr)
|
|
}
|
|
}
|
|
// Send WAV header with placeholder size (0xFFFFFFFF for streaming)
|
|
header := laudio.WAVHeader{
|
|
ChunkID: [4]byte{'R', 'I', 'F', 'F'},
|
|
ChunkSize: 0xFFFFFFFF, // Unknown size for streaming
|
|
Format: [4]byte{'W', 'A', 'V', 'E'},
|
|
Subchunk1ID: [4]byte{'f', 'm', 't', ' '},
|
|
Subchunk1Size: 16,
|
|
AudioFormat: 1, // PCM
|
|
NumChannels: 1, // Mono
|
|
SampleRate: sampleRate,
|
|
ByteRate: sampleRate * 2, // SampleRate * BlockAlign
|
|
BlockAlign: 2, // 16-bit = 2 bytes
|
|
BitsPerSample: 16,
|
|
Subchunk2ID: [4]byte{'d', 'a', 't', 'a'},
|
|
Subchunk2Size: 0xFFFFFFFF, // Unknown size for streaming
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
if writeErr := binary.Write(&buf, binary.LittleEndian, header); writeErr != nil {
|
|
callbackErr = writeErr
|
|
return
|
|
}
|
|
|
|
if writeErr := audioCallback(buf.Bytes()); writeErr != nil {
|
|
callbackErr = writeErr
|
|
return
|
|
}
|
|
headerSent = true
|
|
}
|
|
|
|
// Stream audio chunks
|
|
if len(reply.Audio) > 0 {
|
|
if writeErr := audioCallback(reply.Audio); writeErr != nil {
|
|
callbackErr = writeErr
|
|
}
|
|
// Accumulate PCM for tracing snippet
|
|
totalPCMBytes += len(reply.Audio)
|
|
if appConfig.EnableTracing && !snippetCapped {
|
|
maxBytes := int(sampleRate) * 2 * trace.MaxSnippetSeconds // 16-bit mono
|
|
if len(snippetPCM)+len(reply.Audio) <= maxBytes {
|
|
snippetPCM = append(snippetPCM, reply.Audio...)
|
|
} else {
|
|
remaining := maxBytes - len(snippetPCM)
|
|
if remaining > 0 {
|
|
// Align to sample boundary (2 bytes per sample)
|
|
remaining = remaining &^ 1
|
|
snippetPCM = append(snippetPCM, reply.Audio[:remaining]...)
|
|
}
|
|
snippetCapped = true
|
|
}
|
|
}
|
|
}
|
|
})
|
|
|
|
resultErr := err
|
|
if callbackErr != nil {
|
|
resultErr = callbackErr
|
|
}
|
|
|
|
if appConfig.EnableTracing {
|
|
errStr := ""
|
|
if resultErr != nil {
|
|
errStr = resultErr.Error()
|
|
}
|
|
|
|
data := map[string]any{
|
|
"text": text,
|
|
"voice": voice,
|
|
"language": language,
|
|
"streaming": true,
|
|
}
|
|
if resultErr == nil && len(snippetPCM) > 0 {
|
|
if snippet := trace.AudioSnippetFromPCM(snippetPCM, int(sampleRate), totalPCMBytes, appConfig.TracingMaxBodyBytes); snippet != nil {
|
|
maps.Copy(data, snippet)
|
|
}
|
|
}
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
Timestamp: startTime,
|
|
Duration: time.Since(startTime),
|
|
Type: trace.BackendTraceTTS,
|
|
ModelName: modelConfig.Name,
|
|
Backend: modelConfig.Backend,
|
|
Summary: trace.TruncateString(text, 200),
|
|
Error: errStr,
|
|
Data: data,
|
|
})
|
|
}
|
|
|
|
if callbackErr != nil {
|
|
return callbackErr
|
|
}
|
|
return err
|
|
}
|