refactor(tests): split app_test.go, move real-backend coverage to e2e-backends

core/http/app_test.go had grown to 1495 lines exercising three concerns at
once: HTTP-layer integration, real-backend inference (llama-gguf, tts,
stablediffusion, transformers embeddings, whisper), and service logic that
already has unit-level coverage. Each PR paid for 6 backend builds plus
real-model downloads to satisfy a single suite.

Reorg per layer:

- app_test.go (1495 -> 1003 lines) drives the mock-backend binary only.
  Kept: auth, routing, gallery API, file:// import, /system, agent-jobs
  HTTP plumbing, config-file model loading. Deleted real-inference specs
  (llama-gguf chat, ggml completions/streaming, logprobs, logit_bias,
  transcription, embeddings, External-gRPC, Stores duplicate, Model gallery
  Context). Lifted Agent Jobs out of the deleted Stores Context.
- tests/e2e-backends/backend_test.go gains logprobs, logit_bias, and
  no-first-token-dup specs (the latter folded into PredictStream). Two
  new caps gate the logprobs and logit_bias specs so non-LLM backends
  can opt out.
- tests/e2e-aio/e2e_test.go gains a streaming smoke under Context("text")
  to catch container-level streaming regressions.
- tests/models_fixtures/ removed; all fixtures referenced testmodel.ggml.
  app_test.go now writes per-Context inline mock-model YAMLs (see the
  sketch after this list).
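
For illustration, a minimal sketch of the per-Context inline config write
described above. The helper name and the "mock" backend name are
assumptions, and the YAML keys are modeled on the deleted fixtures; this
is not the actual test code:

import (
	"fmt"
	"os"
	"path/filepath"
)

// writeMockModelConfig writes a minimal model YAML into a Context's own
// models directory before the application under test is started.
// "backend: mock" is an assumption about how the mock backend registers
// itself; substitute whatever name the mock-backend binary announces.
func writeMockModelConfig(modelsDir, name string) error {
	cfg := fmt.Sprintf(`name: %s
backend: mock
parameters:
  model: %s
`, name, name)
	return os.WriteFile(filepath.Join(modelsDir, name+".yaml"), []byte(cfg), 0o644)
}

Each Context's BeforeEach would call something like this with its own
temporary models directory, replacing the shared tests/models_fixtures
files.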

CI:

- test.yml + tests-e2e.yml gain paths-ignore (docs/, examples/, *.md,
  backend/) so docs and backend-only PRs skip them. test.yml drops the
  6-backend Build step plus TRANSFORMER_BACKEND/GO_TAGS=tts; tests-apple
  drops the llama-cpp-darwin build.
- New tests-aio.yml runs the AIO container nightly, on workflow_dispatch,
  and on pushes to master/tags. The tests-e2e-container job moved out of
  test.yml so PRs no longer pay the AIO cost.
- New tests-llama-cpp-smoke job in test-extra.yml runs on every PR with
  no detect-changes gate; pulls quay.io/go-skynet/local-ai-backends:
  master-cpu-llama-cpp (no build on PR) and exercises predict/stream/
  logprobs/logit_bias against Qwen3-0.6B. This is the PR-acceptance
  real-backend gate after AIO moved to nightly. The path-gated heavy
  test-extra-backend-llama-cpp wrapper appends the same caps so it
  exercises the moved specs when the backend actually changes.

Makefile:

- Deleted the test-models/testmodel.ggml target (the wget chain) plus
  test-llama-gguf, test-tts, test-stablediffusion and
  test-realtime-models. The test target drops --label-filter,
  HUGGINGFACE_GRPC, TRANSFORMER_BACKEND, TEST_DIR, FIXTURES, CONFIG_FILE,
  MODELS_PATH, BACKENDS_PATH and now depends on build-mock-backend.
  test-stores keeps a focused entry point and depends on
  backends/local-store. clean-tests also clears the mock-backend binary.

Net effect for a typical Go-side PR: ~25min (6 backend builds + tests +
AIO) + ~8min e2e drops to ~5min mock-backend tests + ~8min e2e +
~5-10min llama-cpp-smoke (image pulled, not built). Docs and
backend-only PRs skip the always-on workflows entirely.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: claude-code:claude-opus-4-7 [Edit] [Write] [Bash]

commit a0317d9926 (parent 3948b580d2)
Ettore Di Giacinto, 2026-04-27 23:09:20 +00:00
18 changed files with 451 additions and 880 deletions

@@ -41,6 +41,34 @@ var _ = Describe("E2E test", func() {
			Expect(len(resp.Choices)).To(Equal(1), fmt.Sprint(resp))
			Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")), fmt.Sprint(resp.Choices[0].Message.Content))
		})
		// Smoke: verifies the AIO container streams chat completions end-to-end.
		// Catches packaging/proxy regressions where the streaming path breaks
		// even though non-streaming works.
		It("streams correctly", func() {
			model := "gpt-4"
			stream := client.Chat.Completions.NewStreaming(context.TODO(),
				openai.ChatCompletionNewParams{
					Model: model,
					Messages: []openai.ChatCompletionMessageParamUnion{
						openai.UserMessage("Count to three."),
					},
				})
			defer stream.Close()
			var chunks int
			var combined string
			for stream.Next() {
				chunk := stream.Current()
				if len(chunk.Choices) > 0 && chunk.Choices[0].Delta.Content != "" {
					chunks++
					combined += chunk.Choices[0].Delta.Content
				}
			}
			Expect(stream.Err()).ToNot(HaveOccurred())
			Expect(chunks).To(BeNumerically(">", 1), "expected multi-chunk stream, got %d", chunks)
			Expect(combined).ToNot(BeEmpty(), "stream produced no content")
		})
	})
	Context("function calls", func() {


@@ -102,6 +102,8 @@ const (
	capVoiceEmbed = "voice_embed"
	capVoiceVerify = "voice_verify"
	capVoiceAnalyze = "voice_analyze"
	capLogprobs = "logprobs"
	capLogitBias = "logit_bias"
	defaultPrompt = "The capital of France is"
	streamPrompt = "Once upon a time"
@@ -422,6 +424,7 @@ var _ = Describe("Backend container", Ordered, func() {
		var chunks int
		var combined string
		var firstChunks []string
		for {
			msg, err := stream.Recv()
			if err == io.EOF {
@@ -431,12 +434,71 @@ var _ = Describe("Backend container", Ordered, func() {
			if len(msg.GetMessage()) > 0 {
				chunks++
				combined += string(msg.GetMessage())
				if len(firstChunks) < 2 {
					firstChunks = append(firstChunks, string(msg.GetMessage()))
				}
			}
		}
		Expect(chunks).To(BeNumerically(">", 0), "no stream chunks received")
		// Regression guard: a bug in llama-cpp's grpc-server.cpp caused the
		// role-init array element to get the same ChatDelta stamped, duplicating
		// the first content token. Applies to any streaming backend.
		if len(firstChunks) >= 2 {
			Expect(firstChunks[0]).NotTo(Equal(firstChunks[1]),
				"first content token was duplicated: %v", firstChunks)
		}
		GinkgoWriter.Printf("Stream: %d chunks, combined=%q\n", chunks, combined)
	})
	// Logprobs: backends that wire OpenAI-compatible logprobs return a
	// JSON-encoded payload in Reply.logprobs (see backend.proto). The exact
	// shape is backend-specific; we only assert that the field is populated
	// when requested. Gated by capLogprobs because not every backend
	// implements it.
	It("returns logprobs when requested", func() {
		if !caps[capLogprobs] {
			Skip("logprobs capability not enabled")
		}
		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
		defer cancel()
		res, err := client.Predict(ctx, &pb.PredictOptions{
			Prompt: prompt,
			Tokens: 10,
			Temperature: 0.1,
			TopK: 40,
			TopP: 0.9,
			Logprobs: 1,
			TopLogprobs: 1,
		})
		Expect(err).NotTo(HaveOccurred())
		Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output")
		Expect(res.GetLogprobs()).NotTo(BeEmpty(), "Reply.logprobs was empty when requested")
		GinkgoWriter.Printf("Logprobs: %d bytes\n", len(res.GetLogprobs()))
	})
	// Logit bias: encoded as a JSON string keyed by token id. We don't
	// know the model's tokenizer, so we exercise the API path with a
	// nonsense bias map that any backend should accept and ignore for
	// unknown ids. The assertion is that the request succeeds — proving
	// the LogitBias plumbing is wired end-to-end.
	It("accepts logit_bias when supplied", func() {
		if !caps[capLogitBias] {
			Skip("logit_bias capability not enabled")
		}
		ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
		defer cancel()
		res, err := client.Predict(ctx, &pb.PredictOptions{
			Prompt: prompt,
			Tokens: 10,
			Temperature: 0.1,
			TopK: 40,
			TopP: 0.9,
			LogitBias: `{"1":-100}`,
		})
		Expect(err).NotTo(HaveOccurred())
		Expect(res.GetMessage()).NotTo(BeEmpty(), "Predict produced empty output with logit_bias")
	})
	It("computes embeddings via Embedding", func() {
		if !caps[capEmbeddings] {
			Skip("embeddings capability not enabled")


@@ -1 +0,0 @@
{{.Input}}


@@ -1,32 +0,0 @@
- name: list1
  parameters:
    model: testmodel.ggml
    top_p: 80
    top_k: 0.9
    temperature: 0.1
  context_size: 200
  stopwords:
  - "HUMAN:"
  - "### Response:"
  roles:
    user: "HUMAN:"
    system: "GPT:"
  template:
    completion: completion
    chat: ggml-gpt4all-j
- name: list2
  parameters:
    top_p: 80
    top_k: 0.9
    temperature: 0.1
    model: testmodel.ggml
  context_size: 200
  stopwords:
  - "HUMAN:"
  - "### Response:"
  roles:
    user: "HUMAN:"
    system: "GPT:"
  template:
    completion: completion
    chat: ggml-gpt4all-j


@@ -1,4 +0,0 @@
name: text-embedding-ada-002
embeddings: true
parameters:
  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf


@@ -1,4 +0,0 @@
The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
### Prompt:
{{.Input}}
### Response:


@@ -1,16 +0,0 @@
name: gpt4all
parameters:
  model: testmodel.ggml
  top_p: 80
  top_k: 0.9
  temperature: 0.1
context_size: 200
stopwords:
- "HUMAN:"
- "### Response:"
roles:
  user: "HUMAN:"
  system: "GPT:"
template:
  completion: completion
  chat: ggml-gpt4all-j


@@ -1,16 +0,0 @@
name: gpt4all-2
parameters:
  model: testmodel.ggml
  top_p: 80
  top_k: 0.9
  temperature: 0.1
context_size: 200
stopwords:
- "HUMAN:"
- "### Response:"
roles:
  user: "HUMAN:"
  system: "GPT:"
template:
  completion: completion
  chat: ggml-gpt4all-j


@@ -1,5 +0,0 @@
name: code-search-ada-code-001
backend: sentencetransformers
embeddings: true
parameters:
  model: all-MiniLM-L6-v2


@@ -1,24 +0,0 @@
name: rwkv_test
parameters:
  model: huggingface://bartowski/rwkv-6-world-7b-GGUF/rwkv-6-world-7b-Q4_K_M.gguf
  top_k: 80
  temperature: 0.9
  max_tokens: 4098
  top_p: 0.8
context_size: 4098
roles:
  user: "User: "
  system: "System: "
  assistant: "Assistant: "
stopwords:
- 'Assistant:'
- '<s>'
template:
  chat: |
    {{.Input}}
    Assistant:
  completion: |
    {{.Input}}


@@ -1,4 +0,0 @@
name: whisper-1
backend: whisper
parameters:
  model: whisper-en