fix(streaming): comply with OpenAI usage / stream_options spec (#9815)

* fix(streaming): comply with OpenAI usage / stream_options spec (#8546)

LocalAI emitted `"usage":{"prompt_tokens":0,...}` on every streamed
chunk because `OpenAIResponse.Usage` was a value type without
`omitempty`. The official OpenAI Node SDK and its consumers
(continuedev/continue, Kilo Code, Roo Code, Zed, IntelliJ Continue)
filter on a truthy `result.usage` to detect the trailing usage chunk;
LocalAI's zero-but-non-null usage on every intermediate chunk made
that filter swallow every content chunk and surface an empty chat
response while the server log looked successful.
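
For illustration (fields abridged), every intermediate chunk's payload previously looked roughly like:

    {"object":"chat.completion.chunk",
     "choices":[{"index":0,"delta":{"content":"Hel"}}],
     "usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

so a consumer that checks `if (chunk.usage)` classified every chunk as the usage trailer and discarded its delta.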

Changes:

- `core/schema/openai.go`: `Usage *OpenAIUsage \`json:"usage,omitempty"\``
  so intermediate chunks no longer carry a `usage` key. Add
  `OpenAIRequest.StreamOptions` with `include_usage` to mirror OpenAI's
  request field.
- `core/http/endpoints/openai/chat.go` and `completion.go`: keep using
  the `Usage` struct field as an in-process channel for the running
  cumulative, but strip it before JSON marshalling. When the request
  set `stream_options.include_usage: true`, emit a dedicated trailing
  chunk with `"choices": []` and the populated usage (matching the
  OpenAI spec and llama.cpp's server behavior); see the example stream
  after this list.
- `chat_emit.go`: new `streamUsageTrailerJSON` helper; drop the
  `usage` parameter from `buildNoActionFinalChunks` since chunks no
  longer carry usage.
- Update `image.go`, `inpainting.go`, `edit.go` to wrap their Usage
  values with `&` for the new pointer field.
- UI: send `stream_options:{include_usage:true}` from the React
  (`useChat.js`) and legacy (`static/chat.js`) chat clients so the
  token-count badge keeps populating now that the server is
  spec-compliant.
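
Sketch of the new wire shape (fields abridged, token counts illustrative): a client opts in with

    {"model":"<model>","stream":true,"stream_options":{"include_usage":true},"messages":[{"role":"user","content":"hi"}]}

and the end of the stream becomes

    data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}
    data: {"object":"chat.completion.chunk","choices":[],"usage":{"prompt_tokens":18,"completion_tokens":14,"total_tokens":32}}
    data: [DONE]

Intermediate chunks carry no `usage` key at all; without `include_usage` the trailer chunk is simply not emitted.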

Tests:

- New `chat_stream_usage_test.go` pins the spec invariants:
  intermediate chunks have no `usage` key, the trailer JSON has
  `"choices":[]` and a populated `usage`, and `OpenAIRequest` parses
  `stream_options.include_usage`.
- Update `chat_emit_test.go` to reflect that finals no longer embed
  usage.

Verified against the live LocalAI instance: before the fix Continue's
filter logic swallowed 16/16 token chunks; with the new shape it
yields 4/5 and routes usage through the dedicated trailer chunk.

Fixes #8546

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(streaming): silence errcheck on usage trailer Fprintf

The new spec-compliant `stream_options.include_usage` trailer writes
were flagged by errcheck since they're new code (golangci-lint runs
with `new-from-merge-base` on master); the surrounding `fmt.Fprintf`
`data:` writes are grandfathered. Drop the return values explicitly to
match the linter's contract without adding a nolint shim.
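
For reference, the explicit-discard pattern used for the new writes (as it appears in the diff below):

    _, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)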

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
LocalAI [bot]
2026-05-14 08:53:46 +02:00
committed by GitHub
parent 6e1dbae256
commit 8af963bdd9
11 changed files with 342 additions and 63 deletions

View File

@@ -131,13 +131,19 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
delta.Reasoning = &reasoningDelta
}
// Usage rides as a struct field for the consumer to track the
// running cumulative — it is stripped before JSON marshal so the
// wire chunk stays spec-compliant (no `usage` on intermediate
// chunks). The dedicated trailer chunk (when include_usage=true)
// carries the final totals.
usageForChunk := usage
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
Object: "chat.completion.chunk",
Usage: usage,
Usage: &usageForChunk,
}
responses <- resp
@@ -164,7 +170,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
hasChatDeltaToolCalls := false
hasChatDeltaContent := false
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
_, _, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
result += s
// Track whether ChatDeltas from the C++ autoparser contain
@@ -387,16 +393,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch {
case noActionToRun:
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
}
if extraUsage {
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
// Token-cumulative usage is communicated to the streaming
// consumer via the per-token callback's chunk struct (stripped
// before wire marshal). The final usage trailer — when the
// caller opted in with stream_options.include_usage — is built
// by the outer streaming loop, not here.
var result string
if !sentInitialRole {
var hqErr error
@@ -409,7 +410,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
for _, chunk := range buildNoActionFinalChunks(
id, req.Model, created,
sentInitialRole, sentReasoning,
result, reasoning, usage,
result, reasoning,
) {
responses <- chunk
}
@@ -724,7 +725,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
xlog.Debug("No choices in the response, skipping")
continue
}
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
// Capture the running cumulative usage from this chunk
// (when present) so the include_usage trailer can carry
// the final totals. Usage is stripped before marshal
// below so the wire chunk stays spec-compliant.
if ev.Usage != nil {
usage = ev.Usage
}
if len(ev.Choices[0].Delta.ToolCalls) > 0 {
toolsCalled = true
// Collect and merge tool call deltas for MCP execution
@@ -740,6 +747,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
collectedContent += *sp
}
}
// OpenAI streaming spec: intermediate chunks must NOT
// carry a `usage` field. Strip the tracking copy
// before marshalling — usage is delivered via the
// dedicated trailer chunk when include_usage=true.
ev.Usage = nil
respData, err := json.Marshal(ev)
if err != nil {
xlog.Debug("Failed to marshal response", "error", err)
@@ -888,6 +900,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
finishReason = FinishReasonFunctionCall
}
// Final delta chunk: empty delta with finish_reason set. Per
// OpenAI streaming spec this chunk does NOT carry usage —
// the optional trailer (below) does, gated on include_usage.
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
@@ -899,11 +914,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Delta: &schema.Message{},
}},
Object: "chat.completion.chunk",
Usage: *usage,
}
respData, _ := json.Marshal(resp)
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
// Trailing usage chunk per OpenAI spec: emit only when the
// caller opted in via stream_options.include_usage. Shape:
// {"choices":[],"usage":{...},"object":"chat.completion.chunk",...}
if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && usage != nil {
trailer := streamUsageTrailerJSON(id, input.Model, created, *usage)
_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)
}
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
c.Response().Flush()
xlog.Debug("Stream ended")
@@ -1263,7 +1285,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "chat.completion",
Usage: usage,
Usage: &usage,
}
respData, _ := json.Marshal(resp)
xlog.Debug("Response", "response", string(respData))

View File

@@ -1,12 +1,45 @@
package openai
import (
"encoding/json"
"fmt"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
)
// streamUsageTrailerJSON returns the bytes of the OpenAI-spec trailing usage
// chunk emitted in streaming completions when the request opts in via
// `stream_options.include_usage: true`. The shape is:
//
// {"id":"...","object":"chat.completion.chunk","created":N,
// "model":"...","choices":[],"usage":{...}}
//
// `choices` is intentionally an empty array (not absent, not null) — that is
// what the OpenAI spec mandates, and what consumers like the official OpenAI
// SDK and Continue's openai-adapter look for to recognise this as the usage
// chunk rather than a content chunk. schema.OpenAIResponse has `omitempty`
// on Choices, so we cannot reuse it for the trailer.
func streamUsageTrailerJSON(id, model string, created int, usage schema.OpenAIUsage) []byte {
trailer := struct {
ID string `json:"id"`
Created int `json:"created"`
Model string `json:"model"`
Object string `json:"object"`
Choices []schema.Choice `json:"choices"`
Usage schema.OpenAIUsage `json:"usage"`
}{
ID: id,
Created: created,
Model: model,
Object: "chat.completion.chunk",
Choices: []schema.Choice{},
Usage: usage,
}
b, _ := json.Marshal(trailer)
return b
}
// hasRealCall reports whether functionResults contains at least one
// entry whose Name is something other than the noAction sentinel.
// Used by processTools to decide between the "answer the question"
@@ -25,10 +58,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b
// pseudo-function or emitted no tool calls at all).
//
// When content was already streamed (contentAlreadyStreamed=true) the
// helper emits a single trailing usage chunk, optionally carrying
// reasoning that was produced but not streamed incrementally. When
// content was not streamed it emits a role chunk followed by a
// content+reasoning+usage chunk — the "send everything at once" fallback.
// helper emits a trailing reasoning chunk if any non-streamed reasoning
// remains, else nothing. When content was not streamed it emits a role
// chunk followed by a content (+reasoning) chunk — the "send everything
// at once" fallback.
//
// Reasoning re-emission is guarded by reasoningAlreadyStreamed, not by
// probing the extractor's Go-side state: the C++ autoparser delivers
@@ -36,6 +69,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b
// separate accumulator that extractor.Reasoning() does not expose.
// Without this guard the callback would stream reasoning incrementally
// and the final chunk would duplicate it.
//
// The returned chunks intentionally do NOT carry a `usage` field. The
// usage trailer is emitted separately by the streaming handler when
// `stream_options.include_usage` is true, per OpenAI spec.
func buildNoActionFinalChunks(
id, model string,
created int,
@@ -43,26 +80,26 @@ func buildNoActionFinalChunks(
reasoningAlreadyStreamed bool,
content string,
reasoning string,
usage schema.OpenAIUsage,
) []schema.OpenAIResponse {
var out []schema.OpenAIResponse
if contentAlreadyStreamed {
delta := &schema.Message{}
if reasoning != "" && !reasoningAlreadyStreamed {
r := reasoning
delta.Reasoning = &r
if reasoning == "" || reasoningAlreadyStreamed {
return nil
}
r := reasoning
out = append(out, schema.OpenAIResponse{
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{Delta: delta, Index: 0}},
Object: "chat.completion.chunk",
Usage: usage,
Choices: []schema.Choice{{
Delta: &schema.Message{Reasoning: &r},
Index: 0,
}},
Object: "chat.completion.chunk",
})
return out
}
// Content was not streamed — send role, then content (+reasoning) + usage.
// Content was not streamed — send role, then content (+reasoning).
out = append(out, schema.OpenAIResponse{
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{
@@ -82,7 +119,6 @@ func buildNoActionFinalChunks(
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{Delta: delta, Index: 0}},
Object: "chat.completion.chunk",
Usage: usage,
})
return out
}

View File

@@ -609,54 +609,52 @@ var _ = Describe("buildNoActionFinalChunks", func() {
testModel = "test-model"
testCreated = 1700000000
)
usage := schema.OpenAIUsage{PromptTokens: 5, CompletionTokens: 7, TotalTokens: 12}
Describe("Content streamed — trailing usage chunk", func() {
It("emits just one chunk with usage, no content, no reasoning when reasoning was streamed", func() {
Describe("Content streamed — trailing reasoning only", func() {
It("emits nothing when content and reasoning were already streamed", func() {
// Before the streaming-usage-spec fix this branch emitted a
// content-less chunk solely to carry `usage`. Per the OpenAI
// spec usage no longer rides on delta chunks; the dedicated
// trailer (when include_usage=true) carries it instead — so
// with nothing to deliver the helper returns no chunks.
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, true,
"", "already-streamed-reasoning", usage,
"", "already-streamed-reasoning",
)
Expect(chunks).To(HaveLen(1))
Expect(chunks[0].Usage.TotalTokens).To(Equal(12))
Expect(contentOf(chunks[0])).To(BeEmpty())
Expect(reasoningOf(chunks[0])).To(BeEmpty(),
"reasoning must not be re-emitted once it was streamed via the callback")
Expect(chunks).To(BeEmpty())
})
It("emits a trailing reasoning delivery when reasoning came only at end", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, false,
"", "autoparser final reasoning", usage,
"", "autoparser final reasoning",
)
Expect(chunks).To(HaveLen(1))
Expect(reasoningOf(chunks[0])).To(Equal("autoparser final reasoning"))
Expect(contentOf(chunks[0])).To(BeEmpty())
Expect(chunks[0].Usage.TotalTokens).To(Equal(12))
Expect(chunks[0].Usage).To(BeNil(),
"intermediate chunks must not carry usage per OpenAI spec")
})
It("omits reasoning when it's empty regardless of streamed flag", func() {
It("returns no chunks when reasoning is empty and content was streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, false,
"", "", usage,
"", "",
)
Expect(chunks).To(HaveLen(1))
Expect(reasoningOf(chunks[0])).To(BeEmpty())
Expect(chunks).To(BeEmpty())
})
})
Describe("Content not streamed — role, then content+usage", func() {
Describe("Content not streamed — role, then content", func() {
It("emits role chunk then content chunk without reasoning when reasoning was streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, true,
"the answer", "already-streamed-reasoning", usage,
"the answer", "already-streamed-reasoning",
)
Expect(chunks).To(HaveLen(2))
@@ -666,14 +664,14 @@ var _ = Describe("buildNoActionFinalChunks", func() {
Expect(contentOf(chunks[1])).To(Equal("the answer"))
Expect(reasoningOf(chunks[1])).To(BeEmpty(),
"reasoning must not be re-emitted if it was streamed earlier")
Expect(chunks[1].Usage.TotalTokens).To(Equal(12))
Expect(chunks[1].Usage).To(BeNil())
})
It("emits role, then content+reasoning when reasoning was not streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"the answer", "autoparser final reasoning", usage,
"the answer", "autoparser final reasoning",
)
Expect(chunks).To(HaveLen(2))
@@ -681,14 +679,14 @@ var _ = Describe("buildNoActionFinalChunks", func() {
Expect(contentOf(chunks[1])).To(Equal("the answer"))
Expect(reasoningOf(chunks[1])).To(Equal("autoparser final reasoning"))
Expect(chunks[1].Usage.TotalTokens).To(Equal(12))
Expect(chunks[1].Usage).To(BeNil())
})
It("still emits content even when reasoning is empty", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"just an answer", "", usage,
"just an answer", "",
)
Expect(chunks).To(HaveLen(2))
@@ -702,7 +700,7 @@ var _ = Describe("buildNoActionFinalChunks", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"hi", "reasoning", usage,
"hi", "reasoning",
)
for i, ch := range chunks {
Expect(ch.ID).To(Equal(testID), "chunk[%d] ID", i)

View File

@@ -0,0 +1,179 @@
package openai
import (
"encoding/json"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// These tests pin LocalAI's streaming chunks to the OpenAI spec for the
// `usage` field. The regression that motivated them (issue #8546) was that
// LocalAI emitted `"usage":{...zeros...}` on every chunk, which made the
// official OpenAI Node SDK consumers (Continue, Kilo Code, Roo Code, Zed,
// IntelliJ Continue) drop every content chunk via the filter at
// continuedev/continue packages/openai-adapters/src/apis/OpenAI.ts:275-288.
//
// Per OpenAI's chat-completion streaming contract:
// - intermediate chunks MUST NOT carry a `usage` field
// - usage is only delivered when the request opts in via
// `stream_options.include_usage: true`, on a final extra chunk whose
// `choices` is an empty array.
var _ = Describe("streaming usage spec compliance", func() {
Describe("OpenAIResponse JSON shape", func() {
It("does not emit a 'usage' key when Usage is unset", func() {
// A typical intermediate token chunk: no Usage populated.
content := "hello"
resp := schema.OpenAIResponse{
ID: "req-1",
Created: 1,
Model: "m",
Object: "chat.completion.chunk",
Choices: []schema.Choice{{
Index: 0,
Delta: &schema.Message{Content: &content},
}},
}
data, err := json.Marshal(resp)
Expect(err).ToNot(HaveOccurred())
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed())
_, present := raw["usage"]
Expect(present).To(BeFalse(),
"intermediate chunk must not include a 'usage' key; got: %s", string(data))
})
It("emits the usage object when Usage is explicitly set", func() {
usage := &schema.OpenAIUsage{PromptTokens: 11, CompletionTokens: 22, TotalTokens: 33}
resp := schema.OpenAIResponse{
ID: "req-1",
Created: 1,
Model: "m",
Object: "chat.completion.chunk",
Usage: usage,
}
data, err := json.Marshal(resp)
Expect(err).ToNot(HaveOccurred())
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed())
u, ok := raw["usage"].(map[string]any)
Expect(ok).To(BeTrue(), "expected 'usage' object, got: %s", string(data))
Expect(u["prompt_tokens"]).To(BeNumerically("==", 11))
Expect(u["completion_tokens"]).To(BeNumerically("==", 22))
Expect(u["total_tokens"]).To(BeNumerically("==", 33))
})
})
Describe("buildNoActionFinalChunks", func() {
It("returns chunks with no Usage embedded", func() {
// Whatever the caller is doing, helpers must not bake usage
// into intermediate or final delta chunks. The usage trailer
// (when requested via include_usage) is emitted separately.
chunks := buildNoActionFinalChunks(
"req-1", "m", 1,
false, false,
"hi", "",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
It("returns chunks with no Usage when only trailing reasoning needs delivery", func() {
chunks := buildNoActionFinalChunks(
"req-1", "m", 1,
true, false,
"", "autoparser late reasoning",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
})
Describe("buildDeferredToolCallChunks", func() {
It("returns chunks with no Usage embedded", func() {
calls := []functions.FuncCallResults{{
Name: "do_thing", Arguments: `{"x":1}`,
}}
chunks := buildDeferredToolCallChunks(
"req-1", "m", 1, calls, 0,
false, "", false, "",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
})
Describe("streamUsageTrailerJSON", func() {
It("produces JSON matching the OpenAI spec for the trailer chunk", func() {
// Trailing usage chunk shape (OpenAI streaming spec):
// {"id":"...","object":"chat.completion.chunk","created":...,
// "model":"...","choices":[],"usage":{...}}
usage := schema.OpenAIUsage{
PromptTokens: 18, CompletionTokens: 14, TotalTokens: 32,
}
data := streamUsageTrailerJSON("req-1", "m", 1, usage)
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed(),
"trailer must be valid JSON, got: %s", string(data))
Expect(raw["id"]).To(Equal("req-1"))
Expect(raw["model"]).To(Equal("m"))
Expect(raw["object"]).To(Equal("chat.completion.chunk"))
Expect(raw["created"]).To(BeNumerically("==", 1))
// `choices` MUST be present as an empty array (not absent, not null).
rawChoices, present := raw["choices"]
Expect(present).To(BeTrue(), "choices key must be present, got: %s", string(data))
choicesArr, ok := rawChoices.([]any)
Expect(ok).To(BeTrue(), "choices must serialize as an array, got: %s", string(data))
Expect(choicesArr).To(BeEmpty(), "choices must be empty in usage trailer, got: %s", string(data))
// `usage` MUST be present and non-null with the populated counts.
u, ok := raw["usage"].(map[string]any)
Expect(ok).To(BeTrue(), "usage object must be present, got: %s", string(data))
Expect(u["prompt_tokens"]).To(BeNumerically("==", 18))
Expect(u["completion_tokens"]).To(BeNumerically("==", 14))
Expect(u["total_tokens"]).To(BeNumerically("==", 32))
})
})
Describe("OpenAIRequest.StreamOptions", func() {
It("parses stream_options.include_usage=true", func() {
body := []byte(`{
"model": "m",
"stream": true,
"stream_options": {"include_usage": true},
"messages": []
}`)
var req schema.OpenAIRequest
Expect(json.Unmarshal(body, &req)).To(Succeed())
Expect(req.StreamOptions).ToNot(BeNil())
Expect(req.StreamOptions.IncludeUsage).To(BeTrue())
})
It("defaults IncludeUsage to false when stream_options is absent", func() {
body := []byte(`{"model":"m","stream":true,"messages":[]}`)
var req schema.OpenAIRequest
Expect(json.Unmarshal(body, &req)).To(Succeed())
// Either a nil StreamOptions or one with IncludeUsage=false is acceptable.
if req.StreamOptions != nil {
Expect(req.StreamOptions.IncludeUsage).To(BeFalse())
}
})
})
})

View File

@@ -39,6 +39,10 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
// Usage rides on the struct for the consumer to track the
// running cumulative; the consumer strips it before marshalling
// so intermediate chunks stay OpenAI-spec compliant.
usageForChunk := usage
resp := schema.OpenAIResponse{
ID: id,
Created: created,
@@ -51,7 +55,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
},
},
Object: "text_completion",
Usage: usage,
Usage: &usageForChunk,
}
xlog.Debug("Sending goroutine", "text", s)
@@ -127,6 +131,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
ended <- process(id, predInput, input, config, ml, responses, extraUsage)
}()
var latestUsage *schema.OpenAIUsage
LOOP:
for {
select {
@@ -135,6 +141,14 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
xlog.Debug("No choices in the response, skipping")
continue
}
// Capture running cumulative usage for the optional trailer
// emitted after the final stop chunk when include_usage=true.
if ev.Usage != nil {
latestUsage = ev.Usage
}
// OpenAI streaming spec: intermediate chunks must NOT
// carry a `usage` field. Strip the tracking copy now.
ev.Usage = nil
respData, err := json.Marshal(ev)
if err != nil {
xlog.Debug("Failed to marshal response", "error", err)
@@ -194,8 +208,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
Object: "text_completion",
}
respData, _ := json.Marshal(resp)
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
// Trailing usage chunk per OpenAI spec: emit only when the caller
// opted in via stream_options.include_usage.
if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && latestUsage != nil {
trailer := streamUsageTrailerJSON(id, input.Model, created, *latestUsage)
_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)
}
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
c.Response().Flush()
return nil
@@ -247,7 +268,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "text_completion",
Usage: usage,
Usage: &usage,
}
jsonResult, _ := json.Marshal(resp)

View File

@@ -92,7 +92,7 @@ func EditEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "edit",
Usage: usage,
Usage: &usage,
}
jsonResult, _ := json.Marshal(resp)

View File

@@ -233,7 +233,7 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
ID: id,
Created: created,
Data: result,
Usage: schema.OpenAIUsage{
Usage: &schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,

View File

@@ -258,7 +258,7 @@ func InpaintingEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
Data: []schema.Item{{
URL: imgPath,
}},
Usage: schema.OpenAIUsage{
Usage: &schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,

View File

@@ -255,7 +255,10 @@ export function useChat(initialModel = '') {
)
messages.push(...historyForApi, { role: 'user', content: messageContent })
const requestBody = { model, messages, stream: true }
// include_usage tells LocalAI to emit a trailing chunk with token totals;
// without it the spec-compliant server drops `usage` from the stream and
// the token-count badge would never populate.
const requestBody = { model, messages, stream: true, stream_options: { include_usage: true } }
if (temperature !== null && temperature !== undefined) requestBody.temperature = temperature
if (topP !== null && topP !== undefined) requestBody.top_p = topP
if (topK !== null && topK !== undefined) requestBody.top_k = topK

View File

@@ -1212,6 +1212,9 @@ async function promptGPT(systemPrompt, input) {
// Add stream parameter for both regular chat and MCP (MCP now supports SSE streaming)
requestBody.stream = true;
// include_usage tells LocalAI to emit a trailing chunk with token totals;
// the spec-compliant server otherwise drops `usage` from the stream.
requestBody.stream_options = { include_usage: true };
// Add generation parameters if they are set (null means use default)
if (activeChat.temperature !== null && activeChat.temperature !== undefined) {

View File

@@ -82,7 +82,21 @@ type OpenAIResponse struct {
Choices []Choice `json:"choices,omitempty"`
Data []Item `json:"data,omitempty"`
Usage OpenAIUsage `json:"usage"`
// Usage is intentionally a pointer with omitempty: per the OpenAI
// chat-completion streaming spec, intermediate chunks must not carry
// a `usage` field. Marshalling a value-typed usage would emit
// `"usage":{"prompt_tokens":0,...}` on every chunk and break
// OpenAI-SDK consumers that filter on a truthy `result.usage`
// (continuedev/continue, Kilo Code, Roo Code, etc.).
Usage *OpenAIUsage `json:"usage,omitempty"`
}
// StreamOptions mirrors OpenAI's `stream_options` request field. The only
// member currently honored is IncludeUsage; when true, the streaming
// chat-completion response emits a trailing chunk with `choices:[]` and a
// populated `usage` object.
type StreamOptions struct {
IncludeUsage bool `json:"include_usage,omitempty" yaml:"include_usage,omitempty"`
}
type Choice struct {
@@ -198,6 +212,9 @@ type OpenAIRequest struct {
Stream bool `json:"stream"`
// StreamOptions opts into OpenAI streaming extensions, e.g. include_usage.
StreamOptions *StreamOptions `json:"stream_options,omitempty" yaml:"stream_options,omitempty"`
// Image (not supported by OpenAI)
Quality string `json:"quality"`
Step int `json:"step"`