mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-30 03:55:58 -04:00
refactor(tests): split app_test.go, move real-backend coverage to e2e-backends
core/http/app_test.go had grown to 1495 lines exercising three concerns at
once: HTTP-layer integration, real-backend inference (llama-gguf, tts,
stablediffusion, transformers embeddings, whisper), and service logic that
already has unit-level coverage. Each PR paid for 6 backend builds plus
real-model downloads to satisfy a single suite.
Reorg per layer:
- app_test.go (1495 -> 1003 lines) drives the mock-backend binary only.
Kept: auth, routing, gallery API, file:// import, /system, agent-jobs
HTTP plumbing, config-file model loading. Deleted real-inference specs
(llama-gguf chat, ggml completions/streaming, logprobs, logit_bias,
transcription, embeddings, External-gRPC, Stores duplicate, Model gallery
Context). Lifted Agent Jobs out of the deleted Stores Context.
- tests/e2e-backends/backend_test.go gains logprobs, logit_bias, and
no-first-token-dup specs (the latter folded into PredictStream). Two
new caps gate them so non-LLM backends opt out.
- tests/e2e-aio/e2e_test.go gains a streaming smoke under Context("text")
to catch container-level streaming regressions.
- tests/models_fixtures/ removed; all fixtures referenced testmodel.ggml.
app_test.go now writes per-Context inline mock-model YAMLs.
CI:
- test.yml + tests-e2e.yml gain paths-ignore (docs/, examples/, *.md,
backend/) so docs and backend-only PRs skip them. test.yml drops the
6-backend Build step plus TRANSFORMER_BACKEND/GO_TAGS=tts; tests-apple
drops the llama-cpp-darwin build.
- New tests-aio.yml runs the AIO container nightly + on workflow_dispatch
+ master/tags. The tests-e2e-container job moved out of test.yml so PRs
no longer pay AIO cost.
- New tests-llama-cpp-smoke job in test-extra.yml runs on every PR with
no detect-changes gate; pulls quay.io/go-skynet/local-ai-backends:
master-cpu-llama-cpp (no build on PR) and exercises predict/stream/
logprobs/logit_bias against Qwen3-0.6B. This is the PR-acceptance
real-backend gate after AIO moved to nightly. The path-gated heavy
test-extra-backend-llama-cpp wrapper appends the same caps so it
exercises the moved specs when the backend actually changes.
Makefile:
- Deleted test-models/testmodel.ggml (the wget chain), test-llama-gguf,
test-tts, test-stablediffusion, test-realtime-models. test target
drops --label-filter, HUGGINGFACE_GRPC, TRANSFORMER_BACKEND, TEST_DIR,
FIXTURES, CONFIG_FILE, MODELS_PATH, BACKENDS_PATH; depends on
build-mock-backend. test-stores keeps a focused entry point and depends
on backends/local-store. clean-tests also clears the mock-backend
binary.
Net effect per typical Go-side PR: CI time drops from ~25min (6 backend
builds + tests + AIO) plus ~8min e2e, down to ~5min mock-backend test +
~8min e2e + ~5-10min llama-cpp-smoke (image pulled, not built). Docs and
backend-only PRs skip the always-on workflows entirely.
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude-code:claude-opus-4-7 [Edit] [Write] [Bash]
This commit is contained in:
@@ -41,6 +41,34 @@ var _ = Describe("E2E test", func() {
			Expect(len(resp.Choices)).To(Equal(1), fmt.Sprint(resp))
			Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")), fmt.Sprint(resp.Choices[0].Message.Content))
		})

		// Smoke: verifies the AIO container streams chat completions end-to-end.
		// Catches packaging/proxy regressions where the streaming path breaks
		// even though non-streaming works.
		It("streams correctly", func() {
			model := "gpt-4"
			stream := client.Chat.Completions.NewStreaming(context.TODO(),
				openai.ChatCompletionNewParams{
					Model: model,
					Messages: []openai.ChatCompletionMessageParamUnion{
						openai.UserMessage("Count to three."),
					},
				})
			defer stream.Close()

			var chunks int
			var combined string
			for stream.Next() {
				chunk := stream.Current()
				if len(chunk.Choices) > 0 && chunk.Choices[0].Delta.Content != "" {
					chunks++
					combined += chunk.Choices[0].Delta.Content
				}
			}
			Expect(stream.Err()).ToNot(HaveOccurred())
			Expect(chunks).To(BeNumerically(">", 1), "expected multi-chunk stream, got %d", chunks)
			Expect(combined).ToNot(BeEmpty(), "stream produced no content")
		})
	})

	Context("function calls", func() {
@@ -102,6 +102,8 @@ const (
	capVoiceEmbed   = "voice_embed"
	capVoiceVerify  = "voice_verify"
	capVoiceAnalyze = "voice_analyze"
	capLogprobs     = "logprobs"
	capLogitBias    = "logit_bias"

	defaultPrompt = "The capital of France is"
	streamPrompt  = "Once upon a time"
@@ -422,6 +424,7 @@ var _ = Describe("Backend container", Ordered, func() {

		var chunks int
		var combined string
		var firstChunks []string
		for {
			msg, err := stream.Recv()
			if err == io.EOF {
@@ -431,12 +434,71 @@ var _ = Describe("Backend container", Ordered, func() {
			if len(msg.GetMessage()) > 0 {
				chunks++
				combined += string(msg.GetMessage())
				if len(firstChunks) < 2 {
					firstChunks = append(firstChunks, string(msg.GetMessage()))
				}
			}
		}
		Expect(chunks).To(BeNumerically(">", 0), "no stream chunks received")
		// Regression guard: a bug in llama-cpp's grpc-server.cpp caused the
		// role-init array element to get the same ChatDelta stamped, duplicating
		// the first content token. Applies to any streaming backend.
		if len(firstChunks) >= 2 {
			Expect(firstChunks[0]).NotTo(Equal(firstChunks[1]),
				"first content token was duplicated: %v", firstChunks)
		}
		GinkgoWriter.Printf("Stream: %d chunks, combined=%q\n", chunks, combined)
	})

	// Logprobs: backends that wire OpenAI-compatible logprobs return a
	// JSON-encoded payload in Reply.logprobs (see backend.proto). The exact
	// shape is backend-specific; we only assert that the field is populated
	// when requested. Gated by capLogprobs because not every backend
	// implements it.
	It("returns logprobs when requested", func() {
		if !caps[capLogprobs] {
			Skip("logprobs capability not enabled")
		}
		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
		defer cancel()
		res, err := client.Predict(ctx, &pb.PredictOptions{
			Prompt:      prompt,
			Tokens:      10,
			Temperature: 0.1,
			TopK:        40,
			TopP:        0.9,
			Logprobs:    1,
			TopLogprobs: 1,
		})
		Expect(err).NotTo(HaveOccurred())
		Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output")
		Expect(res.GetLogprobs()).NotTo(BeEmpty(), "Reply.logprobs was empty when requested")
		GinkgoWriter.Printf("Logprobs: %d bytes\n", len(res.GetLogprobs()))
	})

	// Logit bias: encoded as a JSON string keyed by token id. We don't
	// know the model's tokenizer, so we exercise the API path with a
	// nonsense bias map that any backend should accept and ignore for
	// unknown ids. The assertion is that the request succeeds — proving
	// the LogitBias plumbing is wired end-to-end.
	It("accepts logit_bias when supplied", func() {
		if !caps[capLogitBias] {
			Skip("logit_bias capability not enabled")
		}
		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
		defer cancel()
		res, err := client.Predict(ctx, &pb.PredictOptions{
			Prompt:      prompt,
			Tokens:      10,
			Temperature: 0.1,
			TopK:        40,
			TopP:        0.9,
			LogitBias:   `{"1":-100}`,
		})
		Expect(err).NotTo(HaveOccurred())
		Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output with logit_bias")
	})

	It("computes embeddings via Embedding", func() {
		if !caps[capEmbeddings] {
			Skip("embeddings capability not enabled")
@@ -1 +0,0 @@
{{.Input}}
@@ -1,32 +0,0 @@
- name: list1
  parameters:
    model: testmodel.ggml
    top_p: 80
    top_k: 0.9
    temperature: 0.1
  context_size: 200
  stopwords:
  - "HUMAN:"
  - "### Response:"
  roles:
    user: "HUMAN:"
    system: "GPT:"
  template:
    completion: completion
    chat: ggml-gpt4all-j
- name: list2
  parameters:
    top_p: 80
    top_k: 0.9
    temperature: 0.1
    model: testmodel.ggml
  context_size: 200
  stopwords:
  - "HUMAN:"
  - "### Response:"
  roles:
    user: "HUMAN:"
    system: "GPT:"
  template:
    completion: completion
    chat: ggml-gpt4all-j
@@ -1,4 +0,0 @@
name: text-embedding-ada-002
embeddings: true
parameters:
  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
@@ -1,4 +0,0 @@
The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
### Prompt:
{{.Input}}
### Response:
@@ -1,16 +0,0 @@
name: gpt4all
parameters:
  model: testmodel.ggml
  top_p: 80
  top_k: 0.9
  temperature: 0.1
context_size: 200
stopwords:
- "HUMAN:"
- "### Response:"
roles:
  user: "HUMAN:"
  system: "GPT:"
template:
  completion: completion
  chat: ggml-gpt4all-j
@@ -1,16 +0,0 @@
name: gpt4all-2
parameters:
  model: testmodel.ggml
  top_p: 80
  top_k: 0.9
  temperature: 0.1
context_size: 200
stopwords:
- "HUMAN:"
- "### Response:"
roles:
  user: "HUMAN:"
  system: "GPT:"
template:
  completion: completion
  chat: ggml-gpt4all-j
@@ -1,5 +0,0 @@
name: code-search-ada-code-001
backend: sentencetransformers
embeddings: true
parameters:
  model: all-MiniLM-L6-v2
@@ -1,24 +0,0 @@
name: rwkv_test
parameters:
  model: huggingface://bartowski/rwkv-6-world-7b-GGUF/rwkv-6-world-7b-Q4_K_M.gguf
  top_k: 80
  temperature: 0.9
  max_tokens: 4098
  top_p: 0.8
context_size: 4098

roles:
  user: "User: "
  system: "System: "
  assistant: "Assistant: "

stopwords:
- 'Assistant:'
- '<s>'

template:
  chat: |
    {{.Input}}
    Assistant:
  completion: |
    {{.Input}}
@@ -1,4 +0,0 @@
name: whisper-1
backend: whisper
parameters:
  model: whisper-en
Reference in New Issue
Block a user