mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-01 12:42:55 -04:00
test(distributed): cover streaming X-LocalAI-Node header end-to-end
The pre-existing buffered-handler tests only exercised
maybeSetNodeHeader against a pre-populated ModelLoader store. They did
nothing to verify that the streaming path attaches the header AFTER
ml.Load has stamped a node ID on the model, which is exactly the
ordering bug the streaming rendezvous chan fix addresses.
Add a streaming integration spec that:
- Builds a ModelLoader with a Model entry but NO node ID stamped on
it (so any pre-Load read returns empty).
- Installs a fake backend.ModelInferenceFunc that stamps the node ID
onto the Model AT THE MOMENT IT IS CALLED, matching production
timing where ModelRouterAdapter.Route does the stamp inside
ml.Load.
- Drives processStream with a per-request nodeIDCh and a handler-side
loop that mirrors chat.go's flush ordering (read response, apply
nodeIDCh header, write, flush).
- Asserts the recorded X-LocalAI-Node header equals the node ID the
fake backend stamped during the worker's ml.Load. With the pre-fix
code the header would be empty because the request goroutine read
the loader's node ID before the worker had stamped it.
Cover three additional scenarios:
- ExposeNodeHeader=false suppresses the header even after stamping
(opt-in is sacred).
- Two sequential requests each get THEIR OWN routing decision in the
header, not the prior request's; this is the direct regression
check for the original bug under load.
- The SSE body is still written so we don't regress streaming output
while attaching the header.
All four specs use Ginkgo; no stdlib testing patterns.
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-7[1m]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
256
core/http/endpoints/openai/node_header_stream_test.go
Normal file
256
core/http/endpoints/openai/node_header_stream_test.go
Normal file
@@ -0,0 +1,256 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/system"
|
||||
)
|
||||
|
||||
// Streaming integration coverage for the X-LocalAI-Node header. This is the
|
||||
// test class that would have caught the pre-fix bug where the streaming
|
||||
// branches of chat / completion set the header BEFORE ml.Load ran, so the
|
||||
// header reflected the previous request's routing decision (or was empty
|
||||
// on a cold cache) instead of THIS request's.
|
||||
//
|
||||
// The test drives the real streaming worker (processStream) plus a
|
||||
// handler-side flush loop modelled on chat.go's. The fake
|
||||
// backend.ModelInferenceFunc only stamps the loader's per-modelID NodeID
|
||||
// AT THE MOMENT IT IS CALLED (i.e. AT THE MOMENT ml.Load would run),
|
||||
// proving that the header is sourced from the post-Load value, not from
|
||||
// any state that could have been stashed earlier on the request goroutine.
|
||||
var _ = Describe("X-LocalAI-Node response header (streaming)", func() {
|
||||
const (
|
||||
modelID = "qwen3.5-0.8b"
|
||||
fakeNodeID = "node-stream-target"
|
||||
)
|
||||
|
||||
var (
|
||||
origInference modelInferenceFunc
|
||||
ml *model.ModelLoader
|
||||
appCfg *config.ApplicationConfig
|
||||
e *echo.Echo
|
||||
rec *httptest.ResponseRecorder
|
||||
c echo.Context
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
origInference = backend.ModelInferenceFunc
|
||||
|
||||
// In-memory ModelLoader and store. We pre-create a Model entry
|
||||
// for the test model BUT we do NOT stamp NodeID yet: the fake
|
||||
// ModelInferenceFunc stamps it when invoked, simulating what
|
||||
// ModelRouterAdapter.Route does inside ml.Load in production.
|
||||
ml = model.NewModelLoader(&system.SystemState{})
|
||||
fakeModel := model.NewModelWithClient(modelID, "10.0.0.1:50051", nil)
|
||||
fakeModel.MarkHealthy()
|
||||
store := model.NewInMemoryModelStore()
|
||||
store.Set(modelID, fakeModel)
|
||||
ml.SetModelStore(store)
|
||||
|
||||
appCfg = config.NewApplicationConfig()
|
||||
appCfg.ExposeNodeHeader = true
|
||||
|
||||
e = echo.New()
|
||||
req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil)
|
||||
rec = httptest.NewRecorder()
|
||||
c = e.NewContext(req, rec)
|
||||
})
|
||||
|
||||
AfterEach(func() {
|
||||
backend.ModelInferenceFunc = origInference
|
||||
})
|
||||
|
||||
// stampNodeIDOnLoad replaces backend.ModelInferenceFunc with a stub
|
||||
// that, when invoked (the moment ml.Load runs in the real flow):
|
||||
// 1. Stamps the Model in the loader with `nodeID` (mimics
|
||||
// ModelRouterAdapter.Route → m.SetNodeID).
|
||||
// 2. Returns a predFunc that fires the token callback once with
|
||||
// `tokenText` so the worker proceeds through its normal stream
|
||||
// path.
|
||||
// The stamping happens inside ModelInferenceFunc itself - NOT in the
|
||||
// returned predFunc - to match production timing where the node ID
|
||||
// is bound by ml.Load before any prediction work runs.
|
||||
stampNodeIDOnLoad := func(nodeID, tokenText string) {
|
||||
backend.ModelInferenceFunc = func(
|
||||
ctx context.Context, s string, messages schema.Messages,
|
||||
images, videos, audios []string,
|
||||
loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader,
|
||||
o *config.ApplicationConfig,
|
||||
tokenCallback func(string, backend.TokenUsage) bool,
|
||||
tools, toolChoice string,
|
||||
logprobs, topLogprobs *int,
|
||||
logitBias map[string]float64,
|
||||
metadata map[string]string,
|
||||
) (func() (backend.LLMResponse, error), error) {
|
||||
if loader != nil {
|
||||
if m := loader.CheckIsLoaded(modelID); m != nil {
|
||||
m.SetNodeID(nodeID)
|
||||
}
|
||||
}
|
||||
predFunc := func() (backend.LLMResponse, error) {
|
||||
if tokenCallback != nil {
|
||||
tokenCallback(tokenText, backend.TokenUsage{
|
||||
Prompt: 5,
|
||||
Completion: 3,
|
||||
})
|
||||
}
|
||||
return backend.LLMResponse{
|
||||
Response: tokenText,
|
||||
Usage: backend.TokenUsage{Prompt: 5, Completion: 3},
|
||||
}, nil
|
||||
}
|
||||
return predFunc, nil
|
||||
}
|
||||
}
|
||||
|
||||
// runStreamingHandler mirrors the relevant portion of chat.go's
|
||||
// streaming branch: launch the worker, drain responses, attach the
|
||||
// X-LocalAI-Node header from nodeIDCh before the first flush. It
|
||||
// exercises the exact wiring that the production handler uses; if
|
||||
// the wiring is wrong (Critical 1 regression) this loop will write
|
||||
// SSE chunks to the recorder without ever attaching the header.
|
||||
runStreamingHandler := func(req *schema.OpenAIRequest, cfg *config.ModelConfig) {
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
nodeIDCh := make(chan string, 1)
|
||||
ended := make(chan error, 1)
|
||||
|
||||
go func() {
|
||||
_, err := processStream("prompt", req, cfg, nil, appCfg, ml, responses, "req-1", 0, nodeIDCh)
|
||||
ended <- err
|
||||
}()
|
||||
|
||||
nodeIDApplied := false
|
||||
applyNodeIDHeader := func() {
|
||||
if nodeIDApplied {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case nodeID := <-nodeIDCh:
|
||||
nodeIDApplied = true
|
||||
if nodeID != "" {
|
||||
c.Response().Header().Set(NodeHeaderName, nodeID)
|
||||
}
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// Mirror chat.go's main streaming loop: read each chunk,
|
||||
// call applyNodeIDHeader BEFORE writing/flushing, then write
|
||||
// the chunk to the response writer.
|
||||
LOOP:
|
||||
for {
|
||||
select {
|
||||
case ev, ok := <-responses:
|
||||
if !ok {
|
||||
break LOOP
|
||||
}
|
||||
applyNodeIDHeader()
|
||||
_, _ = c.Response().Write([]byte("data: chunk\n\n"))
|
||||
c.Response().Flush()
|
||||
_ = ev
|
||||
case err := <-ended:
|
||||
// Drain any remaining responses to let the worker exit.
|
||||
if err == nil {
|
||||
for ev := range responses {
|
||||
_ = ev
|
||||
}
|
||||
}
|
||||
break LOOP
|
||||
case <-time.After(2 * time.Second):
|
||||
Fail("streaming handler timed out waiting for chunks")
|
||||
}
|
||||
}
|
||||
// Wait for worker to finish to keep AfterEach deterministic.
|
||||
select {
|
||||
case <-ended:
|
||||
case <-time.After(2 * time.Second):
|
||||
Fail("worker goroutine did not finish in time")
|
||||
}
|
||||
}
|
||||
|
||||
It("attaches the picked node ID to a streaming response after ml.Load runs", func() {
|
||||
stampNodeIDOnLoad(fakeNodeID, "hello")
|
||||
|
||||
// Sanity check: before processStream runs, the model has NO
|
||||
// node ID. This isolates the test from any pre-stamp shortcut
|
||||
// that bypasses the post-Load read.
|
||||
Expect(ml.CheckIsLoaded(modelID).NodeID()).To(BeEmpty(),
|
||||
"precondition: node ID must be empty before processStream is called")
|
||||
|
||||
req := &schema.OpenAIRequest{
|
||||
Context: context.Background(),
|
||||
}
|
||||
req.Model = modelID
|
||||
|
||||
runStreamingHandler(req, &config.ModelConfig{})
|
||||
|
||||
Expect(rec.Header().Get(NodeHeaderName)).To(Equal(fakeNodeID),
|
||||
"header must reflect the node ID stamped DURING ml.Load (this request's routing decision)")
|
||||
})
|
||||
|
||||
It("omits the header when ExposeNodeHeader is off, even after the loader stamps a node ID", func() {
|
||||
appCfg.ExposeNodeHeader = false
|
||||
stampNodeIDOnLoad(fakeNodeID, "hello")
|
||||
|
||||
req := &schema.OpenAIRequest{
|
||||
Context: context.Background(),
|
||||
}
|
||||
req.Model = modelID
|
||||
|
||||
runStreamingHandler(req, &config.ModelConfig{})
|
||||
|
||||
Expect(rec.Header().Get(NodeHeaderName)).To(BeEmpty(),
|
||||
"feature is opt-in; off must mean off even when a node ID is available")
|
||||
})
|
||||
|
||||
It("does not attach the previous request's node ID to a new request (per-request scope)", func() {
|
||||
// Round 1: route to node A and run the streaming handler.
|
||||
stampNodeIDOnLoad("node-A", "first")
|
||||
|
||||
req1 := &schema.OpenAIRequest{Context: context.Background()}
|
||||
req1.Model = modelID
|
||||
runStreamingHandler(req1, &config.ModelConfig{})
|
||||
Expect(rec.Header().Get(NodeHeaderName)).To(Equal("node-A"))
|
||||
|
||||
// Round 2: a new request must independently re-route to node B
|
||||
// and attach node-B to its own response. With the pre-fix bug
|
||||
// (header set on the request goroutine BEFORE ml.Load),
|
||||
// request 2 would carry node-A here because the worker
|
||||
// goroutine hadn't yet overwritten the loader's stamp.
|
||||
stampNodeIDOnLoad("node-B", "second")
|
||||
rec = httptest.NewRecorder()
|
||||
c = e.NewContext(httptest.NewRequest(http.MethodPost, "/v1/chat/completions", nil), rec)
|
||||
|
||||
req2 := &schema.OpenAIRequest{Context: context.Background()}
|
||||
req2.Model = modelID
|
||||
runStreamingHandler(req2, &config.ModelConfig{})
|
||||
|
||||
Expect(rec.Header().Get(NodeHeaderName)).To(Equal("node-B"),
|
||||
"each request must receive ITS OWN routing decision in the header, not the prior request's")
|
||||
})
|
||||
|
||||
It("writes the SSE body unchanged when the streaming flow runs end-to-end", func() {
|
||||
stampNodeIDOnLoad(fakeNodeID, "hello")
|
||||
|
||||
req := &schema.OpenAIRequest{Context: context.Background()}
|
||||
req.Model = modelID
|
||||
runStreamingHandler(req, &config.ModelConfig{})
|
||||
|
||||
// Defensive: the harness above writes a stub SSE payload, so the
|
||||
// body should never be empty when the worker produced tokens.
|
||||
Expect(strings.Contains(rec.Body.String(), "data:")).To(BeTrue(),
|
||||
"streaming body must contain at least one SSE chunk")
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user