Files
LocalAI/tests/e2e-aio/e2e_test.go
Richard Palethorpe 085fc53bbc fix(router): production-ready request router + auto-size batch for embedding/rerank (#10104)
* fix(router): score classifier production-readiness

Conversation trimming runs through the classifier model's chat template
and trims by exact token count, sized to the model's n_batch which is
now scaled to context so long probes can't crash the backend. Missing
chat_message templates are a hard error at router build time. Router-
facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve
ModelConfig per call so a model installed post-startup doesn't bind a
stub Backend="" config and silently fall into the loader's auto-
iterate path.

New 'vector_store' backend trace recorded inside localVectorStore on
every Search/Insert — including the backend-load-failure path that
previously vanished into an xlog.Warn — with outcome tagging
(hit/miss/empty_store/backend_load_error/find_error/insert_error/ok).
Companion cleanup drops misleading similarity:0 and input_tokens_count:0
from non-hit and text-mode traces.

Gallery local-store-development aliases to 'local-store' so the master
image satisfies pkg/model.LocalStoreBackend lookups from the embedding
cache.

Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key
(the original bug); ModelTokenize nil-guard; non-fatal mitm proxy
startup; PII 'route_local' renamed to 'allow' with docs/UI in sync;
model-editor footer no longer eats the edit area on small screens;
several config-editor template/dropdown/section fixes.

Tests: e2e router specs (casual/code-hint + long-conversation trim),
vector_store trace specs, lazy-factory specs, gallery dev-alias
resolution, Playwright trace badge + scroll regression.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(backend): auto-size batch to context for embedding and rerank models

Embedding and rerank models pool over the whole input in a single physical batch (n_ubatch). With batch left at the 512 default, the backend rejects longer inputs with "input is too large to process", silently capping a large-context embedder (e.g. 8k/32k) at 512 tokens. Size n_batch to the context for these single-pass usecases, mirroring the existing FLAG_SCORE behaviour; an explicit batch: still wins.

Extracts EffectiveContextSize/EffectiveBatchSize from grpcModelOpts so the effective decode window has one home for other callers to reuse.

Adds an e2e-aio regression test that embeds a >512-token input. The AIO embedding model is switched to nomic-embed-text-v1.5 (2048 context) because the previous granite model was capped at 512 tokens and could not exercise the larger batch.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(gallery): raise arch-router scoring output cap via parallel:64

Scoring decodes the whole prompt+candidate in a single llama_decode and
reads one logit row per candidate token. The vendored llama.cpp server
caps causal output rows at n_parallel, so the default of 1 aborts with
GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) on multi-token route
labels. Set options: [parallel:64] on both arch-router quant entries to
lift the cap; kv_unified (the grpc-server default) keeps the full context
per sequence, so this does not split the KV cache.

Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-06-12 16:21:15 +02:00

489 lines
17 KiB
Go

package e2e_test
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"github.com/mudler/LocalAI/core/schema"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/openai/openai-go/v3"
"github.com/openai/openai-go/v3/option"
)
var _ = Describe("E2E test", func() {
Context("Generating", func() {
// Per-spec setup hook. The body is intentionally empty: nothing is prepared
// before each spec at the moment.
BeforeEach(func() {
// Nothing to set up.
})
// Per-spec teardown hook. TODO: check that the GPU was used — the check is
// not implemented yet, so the body is empty.
AfterEach(func() {
// Nothing to verify yet.
})
Context("text", func() {
	// Plain (non-streaming) chat completion: a single prompt must yield
	// exactly one choice whose text contains "4" or "four".
	It("correctly", func() {
		request := openai.ChatCompletionNewParams{
			Model: "gpt-4",
			Messages: []openai.ChatCompletionMessageParamUnion{
				openai.UserMessage("How much is 2+2?"),
			},
		}

		completion, err := client.Chat.Completions.New(context.TODO(), request)
		Expect(err).ToNot(HaveOccurred())
		Expect(len(completion.Choices)).To(Equal(1), fmt.Sprint(completion))

		answer := completion.Choices[0].Message.Content
		Expect(answer).To(Or(ContainSubstring("4"), ContainSubstring("four")), fmt.Sprint(answer))
	})

	// Smoke test for the streaming path of the AIO container. It exists to
	// catch packaging/proxy regressions where streaming breaks even though
	// the non-streaming request above still works.
	It("streams correctly", func() {
		request := openai.ChatCompletionNewParams{
			Model: "gpt-4",
			Messages: []openai.ChatCompletionMessageParamUnion{
				openai.UserMessage("Count to three."),
			},
		}

		stream := client.Chat.Completions.NewStreaming(context.TODO(), request)
		defer stream.Close()

		// Count only the chunks that actually carry text and accumulate
		// that text so both chunking and content can be asserted.
		var (
			contentChunks int
			text          string
		)
		for stream.Next() {
			event := stream.Current()
			if len(event.Choices) == 0 {
				continue
			}
			piece := event.Choices[0].Delta.Content
			if piece == "" {
				continue
			}
			contentChunks++
			text += piece
		}

		Expect(stream.Err()).ToNot(HaveOccurred())
		Expect(contentChunks).To(BeNumerically(">", 1), "expected multi-chunk stream, got %d", contentChunks)
		Expect(text).ToNot(BeEmpty(), "stream produced no content")
	})
})
Context("function calls", func() {
	// Offers a single weather tool to the model and expects it to answer a
	// weather question with exactly one call to that tool, mentioning Boston
	// in the call arguments.
	It("correctly invoke", func() {
		weatherTool := openai.ChatCompletionToolUnionParam{
			OfFunction: &openai.ChatCompletionFunctionToolParam{
				Function: openai.FunctionDefinitionParam{
					Name:        "get_current_weather",
					Description: openai.String("Get the current weather in a given location"),
					// JSON-schema description of the tool arguments.
					Parameters: openai.FunctionParameters{
						"type": "object",
						"properties": map[string]any{
							"location": map[string]string{
								"type":        "string",
								"description": "The city and state, e.g. San Francisco, CA",
							},
							"unit": map[string]any{
								"type": "string",
								"enum": []string{"celsius", "fahrenheit"},
							},
						},
						"required": []string{"location"},
					},
				},
			},
		}

		request := openai.ChatCompletionNewParams{
			Model:    openai.ChatModelGPT4,
			Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage("What is the weather in Boston today?")},
			Tools:    []openai.ChatCompletionToolUnionParam{weatherTool},
		}

		completion, err := client.Chat.Completions.New(context.TODO(), request)
		Expect(err).ToNot(HaveOccurred())
		Expect(len(completion.Choices)).To(Equal(1), fmt.Sprint(completion))

		toolCalls := completion.Choices[0].Message.ToolCalls
		Expect(len(toolCalls)).To(Equal(1), fmt.Sprint(toolCalls))

		invoked := toolCalls[0].Function
		Expect(invoked.Name).To(Equal("get_current_weather"), fmt.Sprint(invoked.Name))
		Expect(invoked.Arguments).To(ContainSubstring("Boston"), fmt.Sprint(invoked.Arguments))
	})
})
Context("json", func() {
	// Requests a JSON-object response format and checks that the returned
	// message content parses as JSON and carries the three requested fields.
	It("correctly", func() {
		request := openai.ChatCompletionNewParams{
			Model: "gpt-4",
			Messages: []openai.ChatCompletionMessageParamUnion{
				openai.UserMessage("Generate a JSON object of an animal with 'name', 'gender' and 'legs' fields"),
			},
			ResponseFormat: openai.ChatCompletionNewParamsResponseFormatUnion{
				OfJSONObject: &openai.ResponseFormatJSONObjectParam{},
			},
		}

		completion, err := client.Chat.Completions.New(context.TODO(), request)
		Expect(err).ToNot(HaveOccurred())
		Expect(len(completion.Choices)).To(Equal(1), fmt.Sprint(completion))

		var animal map[string]any
		Expect(json.Unmarshal([]byte(completion.Choices[0].Message.Content), &animal)).ToNot(HaveOccurred())

		// Checked in the same order the prompt lists them.
		for _, field := range []string{"name", "gender", "legs"} {
			Expect(animal).To(HaveKey(field))
		}
	})
})
Context("images", func() {
	// Default response format: exactly one image is returned and its URL
	// must contain "png".
	It("correctly", func() {
		resp, err := client.Images.Generate(context.TODO(),
			openai.ImageGenerateParams{
				Prompt:  "test",
				Size:    openai.ImageGenerateParamsSize256x256,
				Quality: openai.ImageGenerateParamsQualityLow,
			})
		// The description is a constant, so it is passed as a plain string
		// rather than through fmt.Sprintf with no formatting verbs.
		Expect(err).ToNot(HaveOccurred(), "error sending image request")
		Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
		Expect(resp.Data[0].URL).To(ContainSubstring("png"), fmt.Sprint(resp.Data[0].URL))
	})

	// Explicit URL response format: same expectations as the default case.
	It("correctly changes the response format to url", func() {
		resp, err := client.Images.Generate(context.TODO(),
			openai.ImageGenerateParams{
				Prompt:         "test",
				Size:           openai.ImageGenerateParamsSize256x256,
				ResponseFormat: openai.ImageGenerateParamsResponseFormatURL,
				Quality:        openai.ImageGenerateParamsQualityLow,
			},
		)
		Expect(err).ToNot(HaveOccurred())
		Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
		Expect(resp.Data[0].URL).To(ContainSubstring("png"), fmt.Sprint(resp.Data[0].URL))
	})

	// Base64 response format: the image must come back inline in B64JSON.
	It("correctly changes the response format to base64", func() {
		resp, err := client.Images.Generate(context.TODO(),
			openai.ImageGenerateParams{
				Prompt:         "test",
				Size:           openai.ImageGenerateParamsSize256x256,
				ResponseFormat: openai.ImageGenerateParamsResponseFormatB64JSON,
			},
		)
		Expect(err).ToNot(HaveOccurred())
		Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
		Expect(resp.Data[0].B64JSON).ToNot(BeEmpty(), fmt.Sprint(resp.Data[0].B64JSON))
	})
})
Context("embeddings", func() {
	// embeddingParams builds an embeddings request for the given input
	// strings against the model every spec in this context uses.
	embeddingParams := func(inputs ...string) openai.EmbeddingNewParams {
		return openai.EmbeddingNewParams{
			Input: openai.EmbeddingNewParamsInputUnion{
				OfArrayOfStrings: inputs,
			},
			Model: openai.EmbeddingModelTextEmbeddingAda002,
		}
	}

	// Embeds two different strings separately and then together, checking
	// that the batched result matches the individual results position by
	// position and that different inputs yield different vectors.
	It("correctly", func() {
		resp, err := client.Embeddings.New(context.TODO(), embeddingParams("doc"))
		Expect(err).ToNot(HaveOccurred())
		Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
		Expect(resp.Data[0].Embedding).ToNot(BeEmpty())

		resp2, err := client.Embeddings.New(context.TODO(), embeddingParams("cat"))
		Expect(err).ToNot(HaveOccurred())
		// Report the response under test (resp2), not the first one.
		Expect(len(resp2.Data)).To(Equal(1), fmt.Sprint(resp2))
		Expect(resp2.Data[0].Embedding).ToNot(BeEmpty())
		Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[0].Embedding))

		resp3, err := client.Embeddings.New(context.TODO(), embeddingParams("doc", "cat"))
		Expect(err).ToNot(HaveOccurred())
		// Report the response under test (resp3), not the first one.
		Expect(len(resp3.Data)).To(Equal(2), fmt.Sprint(resp3))
		Expect(resp3.Data[0].Embedding).ToNot(BeEmpty())
		Expect(resp3.Data[0].Embedding).To(Equal(resp.Data[0].Embedding))
		Expect(resp3.Data[1].Embedding).To(Equal(resp2.Data[0].Embedding))
		Expect(resp3.Data[0].Embedding).ToNot(Equal(resp3.Data[1].Embedding))
	})

	// Regression guard for the auto-batch fix (core/backend/options.go
	// EffectiveBatchSize). Embeddings pool over the whole sequence in a
	// single physical batch (n_ubatch == n_batch), so an input longer
	// than n_batch is rejected by the backend with "input is too large
	// to process". Before the fix n_batch defaulted to 512 regardless of
	// the model's context, so any prompt over ~512 tokens failed here.
	// The embedding model is configured with a 2048 context (see
	// models/embeddings.yaml); this input is comfortably over 512 tokens
	// and under that context, so it must embed in one pass.
	It("embeds an input larger than the default 512 batch", func() {
		var b bytes.Buffer
		// ~100 short sentences ≈ 1000+ tokens: well past the old 512
		// batch ceiling, well within the 2048 context.
		for i := range 100 {
			fmt.Fprintf(&b, "This is sentence number %d discussing organic skincare and machine learning. ", i)
		}

		resp, err := client.Embeddings.New(context.TODO(), embeddingParams(b.String()))
		Expect(err).ToNot(HaveOccurred(), "a >512-token input must embed in a single batch (auto-batch sizing)")
		Expect(len(resp.Data)).To(Equal(1), fmt.Sprint(resp))
		Expect(resp.Data[0].Embedding).ToNot(BeEmpty())
	})
})
Context("vision", func() {
	// Sends a text question together with a remote image URL in a single
	// user message and expects the description to mention "man" or "road".
	It("correctly", func() {
		question := openai.ChatCompletionContentPartUnionParam{
			OfText: &openai.ChatCompletionContentPartTextParam{
				Type: "text",
				Text: "What is in the image?",
			},
		}
		picture := openai.ChatCompletionContentPartUnionParam{
			OfImageURL: &openai.ChatCompletionContentPartImageParam{
				ImageURL: openai.ChatCompletionContentPartImageImageURLParam{
					URL:    "https://picsum.photos/id/22/4434/3729",
					Detail: "low",
				},
			},
		}
		userTurn := openai.ChatCompletionMessageParamUnion{
			OfUser: &openai.ChatCompletionUserMessageParam{
				Role: "user",
				Content: openai.ChatCompletionUserMessageParamContentUnion{
					OfArrayOfContentParts: []openai.ChatCompletionContentPartUnionParam{question, picture},
				},
			},
		}

		completion, err := client.Chat.Completions.New(context.TODO(),
			openai.ChatCompletionNewParams{
				Model:    "gpt-4o",
				Messages: []openai.ChatCompletionMessageParamUnion{userTurn},
			})
		Expect(err).ToNot(HaveOccurred())
		Expect(len(completion.Choices)).To(Equal(1), fmt.Sprint(completion))

		description := completion.Choices[0].Message.Content
		Expect(description).To(Or(ContainSubstring("man"), ContainSubstring("road")), fmt.Sprint(description))
	})
})
Context("text to audio", func() {
	// Requests speech synthesis for a short phrase and verifies that the
	// request succeeds and the response body can be read to the end.
	It("correctly", func() {
		speechParams := openai.AudioSpeechNewParams{
			Model: openai.SpeechModelTTS1,
			Input: "Hello!",
			Voice: openai.AudioSpeechNewParamsVoiceAlloy,
		}

		speech, err := client.Audio.Speech.New(context.Background(), speechParams)
		Expect(err).ToNot(HaveOccurred())
		defer speech.Body.Close()

		// Only readability is checked; the audio bytes are discarded.
		_, readErr := io.ReadAll(speech.Body)
		Expect(readErr).ToNot(HaveOccurred())
	})
})
Context("audio to text", func() {
	// Sample recording transcribed by both specs in this context.
	const sampleURL = "https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav"

	// Default response format: the transcription text must contain the
	// expected opening words.
	It("correctly", func() {
		file, err := downloadHttpFile(sampleURL)
		Expect(err).ToNot(HaveOccurred())
		// Best-effort removal of the downloaded temp file so repeated runs
		// do not accumulate files. Registered before the handle's Close so
		// that (defers being LIFO) the file is closed before it is removed.
		defer func() { _ = os.Remove(file) }()

		fileHandle, err := os.Open(file)
		Expect(err).ToNot(HaveOccurred())
		defer fileHandle.Close()

		transcriptionResp, err := client.Audio.Transcriptions.New(context.Background(), openai.AudioTranscriptionNewParams{
			Model: openai.AudioModelWhisper1,
			File:  fileHandle,
		})
		Expect(err).ToNot(HaveOccurred())

		resp := transcriptionResp.AsTranscription()
		Expect(resp.Text).To(ContainSubstring("This is the"), fmt.Sprint(resp.Text))
	})

	// VTT response format: the raw body is captured as a string and must
	// contain the transcript, the WEBVTT header and a first cue timestamp.
	It("with VTT format", func() {
		file, err := downloadHttpFile(sampleURL)
		Expect(err).ToNot(HaveOccurred())
		// Best-effort removal of the downloaded temp file; see the ordering
		// note in the spec above.
		defer func() { _ = os.Remove(file) }()

		fileHandle, err := os.Open(file)
		Expect(err).ToNot(HaveOccurred())
		defer fileHandle.Close()

		var resp string
		_, err = client.Audio.Transcriptions.New(context.Background(), openai.AudioTranscriptionNewParams{
			Model:          openai.AudioModelWhisper1,
			File:           fileHandle,
			ResponseFormat: openai.AudioResponseFormatVTT,
		}, option.WithResponseBodyInto(&resp))
		Expect(err).ToNot(HaveOccurred())
		Expect(resp).To(ContainSubstring("This is the"), resp)
		Expect(resp).To(ContainSubstring("WEBVTT"), resp)
		Expect(resp).To(ContainSubstring("00:00:00.000 -->"), resp)
	})
})
Context("vad", func() {
	// Posts the hardcoded sample audio to the /vad endpoint and expects a
	// 200 response whose JSON body decodes into a non-empty set of segments.
	It("correctly", func() {
		req := schema.VADRequest{
			BasicModelRequest: schema.BasicModelRequest{
				Model: "silero-vad",
			},
			Audio: SampleVADAudio, // Use hardcoded sample data for now.
		}
		serialized, err := json.Marshal(req)
		Expect(err).To(BeNil())
		Expect(serialized).ToNot(BeNil())

		vadEndpoint := apiEndpoint + "/vad"
		resp, err := http.Post(vadEndpoint, "application/json", bytes.NewReader(serialized))
		Expect(err).To(BeNil())
		Expect(resp).ToNot(BeNil())
		// Close the body once the spec is done so the connection is not
		// leaked; it is read in full just below.
		defer resp.Body.Close()

		body, err := io.ReadAll(resp.Body)
		Expect(err).ToNot(HaveOccurred())
		Expect(resp.StatusCode).To(Equal(200))

		deserializedResponse := schema.VADResponse{}
		err = json.Unmarshal(body, &deserializedResponse)
		Expect(err).To(BeNil())
		Expect(deserializedResponse).ToNot(BeZero())
		Expect(deserializedResponse.Segments).ToNot(BeZero())
	})
})
Context("reranker", func() {
	// Reranks a fixed document set against a query with a seed-derived
	// top_n, validates count, ordering and index/text consistency of the
	// results, then checks that a zero or negative top_n is rejected.
	It("correctly", func() {
		const (
			modelName = "jina-reranker-v1-base-en"
			query     = "Organic skincare products for sensitive skin"
		)
		documents := []string{
			"Eco-friendly kitchenware for modern homes",
			"Biodegradable cleaning supplies for eco-conscious consumers",
			"Organic cotton baby clothes for sensitive skin",
			"Natural organic skincare range for sensitive skin",
			"Tech gadgets for smart homes: 2024 edition",
			"Sustainable gardening tools and compost solutions",
			"Sensitive skin-friendly facial cleansers and toners",
			"Organic food wraps and storage solutions",
			"All-natural pet food for dogs with allergies",
			"Yoga mats made from recycled materials",
		}

		// The Ginkgo seed drives every pseudo-random choice below.
		seed := int(GinkgoRandomSeed())

		// Ask for between 1 and len(documents)+1 results, so the request
		// can exceed the number of documents by one.
		requestResults := seed%(len(documents)+1) + 1
		// The expected count can never exceed the number of documents.
		expectResults := min(requestResults, len(documents))

		// When every document is requested anyway, omit top_n on even seeds.
		topN := &requestResults
		if requestResults >= len(documents) && seed%2 == 0 {
			topN = nil
		}

		resp, body := requestRerank(modelName, query, documents, topN, apiEndpoint)
		Expect(resp.StatusCode).To(Equal(200), fmt.Sprintf("body: %s, response: %+v", body, resp))

		reranked := schema.JINARerankResponse{}
		Expect(json.Unmarshal(body, &reranked)).To(BeNil())
		Expect(reranked).ToNot(BeZero())
		Expect(reranked.Model).To(Equal(modelName))
		Expect(len(reranked.Results)).To(Equal(expectResults))

		results := reranked.Results

		// Relevance scores must never increase from one result to the next.
		for i := 1; i < len(results); i++ {
			previous, current := results[i-1], results[i]
			Expect(current.RelevanceScore).To(
				BeNumerically("<=", previous.RelevanceScore),
				fmt.Sprintf("Result at index %d should have lower relevance score than previous result.", i),
			)
		}

		// Each result must reference a valid document and echo its text.
		for pos, entry := range results {
			Expect(entry.Index).To(
				And(
					BeNumerically(">=", 0),
					BeNumerically("<", len(documents)),
				),
				fmt.Sprintf("Result at position %d has index %d which should be within bounds [0, %d)", pos, entry.Index, len(documents)),
			)
			Expect(entry.Document.Text).To(
				Equal(documents[entry.Index]),
				fmt.Sprintf("Result at position %d (index %d) should have document text '%s', but got '%s'",
					pos, entry.Index, documents[entry.Index], entry.Document.Text),
			)
		}

		// A top_n of -1 or 0 (chosen by seed parity) must be answered with 422.
		invalidTopN := seed%2 - 1
		rejected, rejectedBody := requestRerank(modelName, query, documents, &invalidTopN, apiEndpoint)
		Expect(rejected.StatusCode).To(Equal(422), fmt.Sprintf("body: %s, response: %+v", rejectedBody, rejected))
	})
})
})
})
// downloadHttpFile fetches url with an HTTP GET and stores the response body
// in a newly created temporary file, returning that file's path.
//
// A response outside the 2xx range is reported as an error rather than being
// saved as if it were the requested content. If the download does not
// complete, the temporary file is removed before returning. On success the
// caller owns the returned file and is responsible for deleting it.
func downloadHttpFile(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Without this check an error page (404, 500, ...) would be written to
	// disk and later fed to the consumer as if it were the real payload.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("downloading %q: unexpected status %s", url, resp.Status)
	}

	tmpfile, err := os.CreateTemp("", "example")
	if err != nil {
		return "", err
	}

	if _, err := io.Copy(tmpfile, resp.Body); err != nil {
		// Best-effort cleanup of the partial file; the copy error is the
		// one worth reporting.
		_ = tmpfile.Close()
		_ = os.Remove(tmpfile.Name())
		return "", err
	}
	// A failed Close can mean the data never reached disk, so surface it.
	if err := tmpfile.Close(); err != nil {
		_ = os.Remove(tmpfile.Name())
		return "", err
	}
	return tmpfile.Name(), nil
}
// requestRerank POSTs a rerank request for query over documents to
// apiEndpoint + "/rerank" and returns the HTTP response together with its
// fully read body.
//
// topN is forwarded as the request's TopN field; nil leaves it unset. Any
// marshalling, transport or read failure fails the current spec through
// Gomega. The response body is closed before returning, so callers should use
// the returned byte slice rather than resp.Body.
func requestRerank(modelName, query string, documents []string, topN *int, apiEndpoint string) (*http.Response, []byte) {
	req := schema.JINARerankRequest{
		BasicModelRequest: schema.BasicModelRequest{
			Model: modelName,
		},
		Query:     query,
		Documents: documents,
		TopN:      topN,
	}

	serialized, err := json.Marshal(req)
	Expect(err).To(BeNil())
	Expect(serialized).ToNot(BeNil())

	rerankerEndpoint := apiEndpoint + "/rerank"
	resp, err := http.Post(rerankerEndpoint, "application/json", bytes.NewReader(serialized))
	Expect(err).To(BeNil())
	Expect(resp).ToNot(BeNil())
	// The body is consumed in full below; closing it releases the
	// underlying connection instead of leaking it on every call.
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	Expect(err).ToNot(HaveOccurred())
	return resp, body
}