fix(streaming): comply with OpenAI usage / stream_options spec (#9815)

* fix(streaming): comply with OpenAI usage / stream_options spec (#8546)

LocalAI emitted `"usage":{"prompt_tokens":0,...}` on every streamed
chunk because `OpenAIResponse.Usage` was a value type without
`omitempty`. The official OpenAI Node SDK and its consumers
(continuedev/continue, Kilo Code, Roo Code, Zed, IntelliJ Continue)
filter on a truthy `result.usage` to detect the trailing usage chunk;
LocalAI's zero-but-non-null usage on every intermediate chunk made
that filter swallow every content chunk and surface an empty chat
response while the server log looked successful.
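
For illustration (fields abridged), every intermediate chunk's payload previously looked roughly like:

    {"object":"chat.completion.chunk",
     "choices":[{"index":0,"delta":{"content":"Hel"}}],
     "usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}

so a consumer that checks `if (chunk.usage)` classified every chunk as the usage trailer and discarded its delta.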

Changes:

- `core/schema/openai.go`: `Usage *OpenAIUsage \`json:"usage,omitempty"\``
  so intermediate chunks no longer carry a `usage` key. Add
  `OpenAIRequest.StreamOptions` with `include_usage` to mirror OpenAI's
  request field.
- `core/http/endpoints/openai/chat.go` and `completion.go`: keep using
  the `Usage` struct field as an in-process channel for the running
  cumulative, but strip it before JSON marshalling. When the request
  set `stream_options.include_usage: true`, emit a dedicated trailing
  chunk with `"choices": []` and the populated usage (matching the
  OpenAI spec and llama.cpp's server behavior); see the example stream
  after this list.
- `chat_emit.go`: new `streamUsageTrailerJSON` helper; drop the
  `usage` parameter from `buildNoActionFinalChunks` since chunks no
  longer carry usage.
- Update `image.go`, `inpainting.go`, `edit.go` to wrap their Usage
  values with `&` for the new pointer field.
- UI: send `stream_options:{include_usage:true}` from the React
  (`useChat.js`) and legacy (`static/chat.js`) chat clients so the
  token-count badge keeps populating now that the server is
  spec-compliant.
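
Sketch of the new wire shape (fields abridged, token counts illustrative): a client opts in with

    {"model":"<model>","stream":true,"stream_options":{"include_usage":true},"messages":[{"role":"user","content":"hi"}]}

and the end of the stream becomes

    data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}
    data: {"object":"chat.completion.chunk","choices":[],"usage":{"prompt_tokens":18,"completion_tokens":14,"total_tokens":32}}
    data: [DONE]

Intermediate chunks carry no `usage` key at all; without `include_usage` the trailer chunk is simply not emitted.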

Tests:

- New `chat_stream_usage_test.go` pins the spec invariants:
  intermediate chunks have no `usage` key, the trailer JSON has
  `"choices":[]` and a populated `usage`, and `OpenAIRequest` parses
  `stream_options.include_usage`.
- Update `chat_emit_test.go` to reflect that finals no longer embed
  usage.

Verified against the live LocalAI instance: before the fix Continue's
filter logic swallowed 16/16 token chunks; with the new shape it
yields 4/5 and routes usage through the dedicated trailer chunk.

Fixes #8546

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(streaming): silence errcheck on usage trailer Fprintf

The new spec-compliant `stream_options.include_usage` trailer writes
were flagged by errcheck since they're new code (golangci-lint runs
with `new-from-merge-base` on master); the surrounding `fmt.Fprintf`
`data:` writes are grandfathered. Drop the return values explicitly to
match the linter's contract without adding a nolint shim.
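
For reference, the explicit-discard pattern used for the new writes (as it appears in the diff below):

    _, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)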

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
LocalAI [bot]
2026-05-14 08:53:46 +02:00
committed by GitHub
parent 6e1dbae256
commit 8af963bdd9
11 changed files with 342 additions and 63 deletions

View File

@@ -131,13 +131,19 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
delta.Reasoning = &reasoningDelta
}
// Usage rides as a struct field for the consumer to track the
// running cumulative — it is stripped before JSON marshal so the
// wire chunk stays spec-compliant (no `usage` on intermediate
// chunks). The dedicated trailer chunk (when include_usage=true)
// carries the final totals.
usageForChunk := usage
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
Object: "chat.completion.chunk",
Usage: usage,
Usage: &usageForChunk,
}
responses <- resp
@@ -164,7 +170,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
hasChatDeltaToolCalls := false
hasChatDeltaContent := false
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
_, _, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
result += s
// Track whether ChatDeltas from the C++ autoparser contain
@@ -387,16 +393,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch {
case noActionToRun:
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
}
if extraUsage {
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
// Token-cumulative usage is communicated to the streaming
// consumer via the per-token callback's chunk struct (stripped
// before wire marshal). The final usage trailer — when the
// caller opted in with stream_options.include_usage — is built
// by the outer streaming loop, not here.
var result string
if !sentInitialRole {
var hqErr error
@@ -409,7 +410,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
for _, chunk := range buildNoActionFinalChunks(
id, req.Model, created,
sentInitialRole, sentReasoning,
result, reasoning, usage,
result, reasoning,
) {
responses <- chunk
}
@@ -724,7 +725,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
xlog.Debug("No choices in the response, skipping")
continue
}
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
// Capture the running cumulative usage from this chunk
// (when present) so the include_usage trailer can carry
// the final totals. Usage is stripped before marshal
// below so the wire chunk stays spec-compliant.
if ev.Usage != nil {
usage = ev.Usage
}
if len(ev.Choices[0].Delta.ToolCalls) > 0 {
toolsCalled = true
// Collect and merge tool call deltas for MCP execution
@@ -740,6 +747,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
collectedContent += *sp
}
}
// OpenAI streaming spec: intermediate chunks must NOT
// carry a `usage` field. Strip the tracking copy
// before marshalling — usage is delivered via the
// dedicated trailer chunk when include_usage=true.
ev.Usage = nil
respData, err := json.Marshal(ev)
if err != nil {
xlog.Debug("Failed to marshal response", "error", err)
@@ -888,6 +900,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
finishReason = FinishReasonFunctionCall
}
// Final delta chunk: empty delta with finish_reason set. Per
// OpenAI streaming spec this chunk does NOT carry usage —
// the optional trailer (below) does, gated on include_usage.
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
@@ -899,11 +914,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Delta: &schema.Message{},
}},
Object: "chat.completion.chunk",
Usage: *usage,
}
respData, _ := json.Marshal(resp)
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
// Trailing usage chunk per OpenAI spec: emit only when the
// caller opted in via stream_options.include_usage. Shape:
// {"choices":[],"usage":{...},"object":"chat.completion.chunk",...}
if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && usage != nil {
trailer := streamUsageTrailerJSON(id, input.Model, created, *usage)
_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)
}
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
c.Response().Flush()
xlog.Debug("Stream ended")
@@ -1263,7 +1285,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "chat.completion",
Usage: usage,
Usage: &usage,
}
respData, _ := json.Marshal(resp)
xlog.Debug("Response", "response", string(respData))

View File

@@ -1,12 +1,45 @@
package openai
import (
"encoding/json"
"fmt"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
)
// streamUsageTrailerJSON returns the bytes of the OpenAI-spec trailing usage
// chunk emitted in streaming completions when the request opts in via
// `stream_options.include_usage: true`. The shape is:
//
// {"id":"...","object":"chat.completion.chunk","created":N,
// "model":"...","choices":[],"usage":{...}}
//
// `choices` is intentionally an empty array (not absent, not null) — that is
// what the OpenAI spec mandates, and what consumers like the official OpenAI
// SDK and Continue's openai-adapter look for to recognise this as the usage
// chunk rather than a content chunk. schema.OpenAIResponse has `omitempty`
// on Choices, so we cannot reuse it for the trailer.
func streamUsageTrailerJSON(id, model string, created int, usage schema.OpenAIUsage) []byte {
trailer := struct {
ID string `json:"id"`
Created int `json:"created"`
Model string `json:"model"`
Object string `json:"object"`
Choices []schema.Choice `json:"choices"`
Usage schema.OpenAIUsage `json:"usage"`
}{
ID: id,
Created: created,
Model: model,
Object: "chat.completion.chunk",
Choices: []schema.Choice{},
Usage: usage,
}
b, _ := json.Marshal(trailer)
return b
}
// hasRealCall reports whether functionResults contains at least one
// entry whose Name is something other than the noAction sentinel.
// Used by processTools to decide between the "answer the question"
@@ -25,10 +58,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b
// pseudo-function or emitted no tool calls at all).
//
// When content was already streamed (contentAlreadyStreamed=true) the
// helper emits a single trailing usage chunk, optionally carrying
// reasoning that was produced but not streamed incrementally. When
// content was not streamed it emits a role chunk followed by a
// content+reasoning+usage chunk — the "send everything at once" fallback.
// helper emits a trailing reasoning chunk if any non-streamed reasoning
// remains, else nothing. When content was not streamed it emits a role
// chunk followed by a content (+reasoning) chunk — the "send everything
// at once" fallback.
//
// Reasoning re-emission is guarded by reasoningAlreadyStreamed, not by
// probing the extractor's Go-side state: the C++ autoparser delivers
@@ -36,6 +69,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b
// separate accumulator that extractor.Reasoning() does not expose.
// Without this guard the callback would stream reasoning incrementally
// and the final chunk would duplicate it.
//
// The returned chunks intentionally do NOT carry a `usage` field. The
// usage trailer is emitted separately by the streaming handler when
// `stream_options.include_usage` is true, per OpenAI spec.
func buildNoActionFinalChunks(
id, model string,
created int,
@@ -43,26 +80,26 @@ func buildNoActionFinalChunks(
reasoningAlreadyStreamed bool,
content string,
reasoning string,
usage schema.OpenAIUsage,
) []schema.OpenAIResponse {
var out []schema.OpenAIResponse
if contentAlreadyStreamed {
delta := &schema.Message{}
if reasoning != "" && !reasoningAlreadyStreamed {
r := reasoning
delta.Reasoning = &r
if reasoning == "" || reasoningAlreadyStreamed {
return nil
}
r := reasoning
out = append(out, schema.OpenAIResponse{
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{Delta: delta, Index: 0}},
Object: "chat.completion.chunk",
Usage: usage,
Choices: []schema.Choice{{
Delta: &schema.Message{Reasoning: &r},
Index: 0,
}},
Object: "chat.completion.chunk",
})
return out
}
// Content was not streamed — send role, then content (+reasoning) + usage.
// Content was not streamed — send role, then content (+reasoning).
out = append(out, schema.OpenAIResponse{
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{
@@ -82,7 +119,6 @@ func buildNoActionFinalChunks(
ID: id, Created: created, Model: model,
Choices: []schema.Choice{{Delta: delta, Index: 0}},
Object: "chat.completion.chunk",
Usage: usage,
})
return out
}

View File

@@ -609,54 +609,52 @@ var _ = Describe("buildNoActionFinalChunks", func() {
testModel = "test-model"
testCreated = 1700000000
)
usage := schema.OpenAIUsage{PromptTokens: 5, CompletionTokens: 7, TotalTokens: 12}
Describe("Content streamed — trailing usage chunk", func() {
It("emits just one chunk with usage, no content, no reasoning when reasoning was streamed", func() {
Describe("Content streamed — trailing reasoning only", func() {
It("emits nothing when content and reasoning were already streamed", func() {
// Before the streaming-usage-spec fix this branch emitted a
// content-less chunk solely to carry `usage`. Per the OpenAI
// spec usage no longer rides on delta chunks; the dedicated
// trailer (when include_usage=true) carries it instead — so
// with nothing to deliver the helper returns no chunks.
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, true,
"", "already-streamed-reasoning", usage,
"", "already-streamed-reasoning",
)
Expect(chunks).To(HaveLen(1))
Expect(chunks[0].Usage.TotalTokens).To(Equal(12))
Expect(contentOf(chunks[0])).To(BeEmpty())
Expect(reasoningOf(chunks[0])).To(BeEmpty(),
"reasoning must not be re-emitted once it was streamed via the callback")
Expect(chunks).To(BeEmpty())
})
It("emits a trailing reasoning delivery when reasoning came only at end", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, false,
"", "autoparser final reasoning", usage,
"", "autoparser final reasoning",
)
Expect(chunks).To(HaveLen(1))
Expect(reasoningOf(chunks[0])).To(Equal("autoparser final reasoning"))
Expect(contentOf(chunks[0])).To(BeEmpty())
Expect(chunks[0].Usage.TotalTokens).To(Equal(12))
Expect(chunks[0].Usage).To(BeNil(),
"intermediate chunks must not carry usage per OpenAI spec")
})
It("omits reasoning when it's empty regardless of streamed flag", func() {
It("returns no chunks when reasoning is empty and content was streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
true, false,
"", "", usage,
"", "",
)
Expect(chunks).To(HaveLen(1))
Expect(reasoningOf(chunks[0])).To(BeEmpty())
Expect(chunks).To(BeEmpty())
})
})
Describe("Content not streamed — role, then content+usage", func() {
Describe("Content not streamed — role, then content", func() {
It("emits role chunk then content chunk without reasoning when reasoning was streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, true,
"the answer", "already-streamed-reasoning", usage,
"the answer", "already-streamed-reasoning",
)
Expect(chunks).To(HaveLen(2))
@@ -666,14 +664,14 @@ var _ = Describe("buildNoActionFinalChunks", func() {
Expect(contentOf(chunks[1])).To(Equal("the answer"))
Expect(reasoningOf(chunks[1])).To(BeEmpty(),
"reasoning must not be re-emitted if it was streamed earlier")
Expect(chunks[1].Usage.TotalTokens).To(Equal(12))
Expect(chunks[1].Usage).To(BeNil())
})
It("emits role, then content+reasoning when reasoning was not streamed", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"the answer", "autoparser final reasoning", usage,
"the answer", "autoparser final reasoning",
)
Expect(chunks).To(HaveLen(2))
@@ -681,14 +679,14 @@ var _ = Describe("buildNoActionFinalChunks", func() {
Expect(contentOf(chunks[1])).To(Equal("the answer"))
Expect(reasoningOf(chunks[1])).To(Equal("autoparser final reasoning"))
Expect(chunks[1].Usage.TotalTokens).To(Equal(12))
Expect(chunks[1].Usage).To(BeNil())
})
It("still emits content even when reasoning is empty", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"just an answer", "", usage,
"just an answer", "",
)
Expect(chunks).To(HaveLen(2))
@@ -702,7 +700,7 @@ var _ = Describe("buildNoActionFinalChunks", func() {
chunks := buildNoActionFinalChunks(
testID, testModel, testCreated,
false, false,
"hi", "reasoning", usage,
"hi", "reasoning",
)
for i, ch := range chunks {
Expect(ch.ID).To(Equal(testID), "chunk[%d] ID", i)

View File

@@ -0,0 +1,179 @@
package openai
import (
"encoding/json"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// These tests pin LocalAI's streaming chunks to the OpenAI spec for the
// `usage` field. The regression that motivated them (issue #8546) was that
// LocalAI emitted `"usage":{...zeros...}` on every chunk, which made the
// official OpenAI Node SDK consumers (Continue, Kilo Code, Roo Code, Zed,
// IntelliJ Continue) drop every content chunk via the filter at
// continuedev/continue packages/openai-adapters/src/apis/OpenAI.ts:275-288.
//
// Per OpenAI's chat-completion streaming contract:
// - intermediate chunks MUST NOT carry a `usage` field
// - usage is only delivered when the request opts in via
// `stream_options.include_usage: true`, on a final extra chunk whose
// `choices` is an empty array.
var _ = Describe("streaming usage spec compliance", func() {
Describe("OpenAIResponse JSON shape", func() {
It("does not emit a 'usage' key when Usage is unset", func() {
// A typical intermediate token chunk: no Usage populated.
content := "hello"
resp := schema.OpenAIResponse{
ID: "req-1",
Created: 1,
Model: "m",
Object: "chat.completion.chunk",
Choices: []schema.Choice{{
Index: 0,
Delta: &schema.Message{Content: &content},
}},
}
data, err := json.Marshal(resp)
Expect(err).ToNot(HaveOccurred())
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed())
_, present := raw["usage"]
Expect(present).To(BeFalse(),
"intermediate chunk must not include a 'usage' key; got: %s", string(data))
})
It("emits the usage object when Usage is explicitly set", func() {
usage := &schema.OpenAIUsage{PromptTokens: 11, CompletionTokens: 22, TotalTokens: 33}
resp := schema.OpenAIResponse{
ID: "req-1",
Created: 1,
Model: "m",
Object: "chat.completion.chunk",
Usage: usage,
}
data, err := json.Marshal(resp)
Expect(err).ToNot(HaveOccurred())
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed())
u, ok := raw["usage"].(map[string]any)
Expect(ok).To(BeTrue(), "expected 'usage' object, got: %s", string(data))
Expect(u["prompt_tokens"]).To(BeNumerically("==", 11))
Expect(u["completion_tokens"]).To(BeNumerically("==", 22))
Expect(u["total_tokens"]).To(BeNumerically("==", 33))
})
})
Describe("buildNoActionFinalChunks", func() {
It("returns chunks with no Usage embedded", func() {
// Whatever the caller is doing, helpers must not bake usage
// into intermediate or final delta chunks. The usage trailer
// (when requested via include_usage) is emitted separately.
chunks := buildNoActionFinalChunks(
"req-1", "m", 1,
false, false,
"hi", "",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
It("returns chunks with no Usage when only trailing reasoning needs delivery", func() {
chunks := buildNoActionFinalChunks(
"req-1", "m", 1,
true, false,
"", "autoparser late reasoning",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
})
Describe("buildDeferredToolCallChunks", func() {
It("returns chunks with no Usage embedded", func() {
calls := []functions.FuncCallResults{{
Name: "do_thing", Arguments: `{"x":1}`,
}}
chunks := buildDeferredToolCallChunks(
"req-1", "m", 1, calls, 0,
false, "", false, "",
)
Expect(chunks).ToNot(BeEmpty())
for i, ch := range chunks {
Expect(ch.Usage).To(BeNil(),
"chunk[%d] must not carry Usage; got %+v", i, ch.Usage)
}
})
})
Describe("streamUsageTrailerJSON", func() {
It("produces JSON matching the OpenAI spec for the trailer chunk", func() {
// Trailing usage chunk shape (OpenAI streaming spec):
// {"id":"...","object":"chat.completion.chunk","created":...,
// "model":"...","choices":[],"usage":{...}}
usage := schema.OpenAIUsage{
PromptTokens: 18, CompletionTokens: 14, TotalTokens: 32,
}
data := streamUsageTrailerJSON("req-1", "m", 1, usage)
var raw map[string]any
Expect(json.Unmarshal(data, &raw)).To(Succeed(),
"trailer must be valid JSON, got: %s", string(data))
Expect(raw["id"]).To(Equal("req-1"))
Expect(raw["model"]).To(Equal("m"))
Expect(raw["object"]).To(Equal("chat.completion.chunk"))
Expect(raw["created"]).To(BeNumerically("==", 1))
// `choices` MUST be present as an empty array (not absent, not null).
rawChoices, present := raw["choices"]
Expect(present).To(BeTrue(), "choices key must be present, got: %s", string(data))
choicesArr, ok := rawChoices.([]any)
Expect(ok).To(BeTrue(), "choices must serialize as an array, got: %s", string(data))
Expect(choicesArr).To(BeEmpty(), "choices must be empty in usage trailer, got: %s", string(data))
// `usage` MUST be present and non-null with the populated counts.
u, ok := raw["usage"].(map[string]any)
Expect(ok).To(BeTrue(), "usage object must be present, got: %s", string(data))
Expect(u["prompt_tokens"]).To(BeNumerically("==", 18))
Expect(u["completion_tokens"]).To(BeNumerically("==", 14))
Expect(u["total_tokens"]).To(BeNumerically("==", 32))
})
})
Describe("OpenAIRequest.StreamOptions", func() {
It("parses stream_options.include_usage=true", func() {
body := []byte(`{
"model": "m",
"stream": true,
"stream_options": {"include_usage": true},
"messages": []
}`)
var req schema.OpenAIRequest
Expect(json.Unmarshal(body, &req)).To(Succeed())
Expect(req.StreamOptions).ToNot(BeNil())
Expect(req.StreamOptions.IncludeUsage).To(BeTrue())
})
It("defaults IncludeUsage to false when stream_options is absent", func() {
body := []byte(`{"model":"m","stream":true,"messages":[]}`)
var req schema.OpenAIRequest
Expect(json.Unmarshal(body, &req)).To(Succeed())
// Either a nil StreamOptions or one with IncludeUsage=false is acceptable.
if req.StreamOptions != nil {
Expect(req.StreamOptions.IncludeUsage).To(BeFalse())
}
})
})
})

View File

@@ -39,6 +39,10 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
// Usage rides on the struct for the consumer to track the
// running cumulative; the consumer strips it before marshalling
// so intermediate chunks stay OpenAI-spec compliant.
usageForChunk := usage
resp := schema.OpenAIResponse{
ID: id,
Created: created,
@@ -51,7 +55,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
},
},
Object: "text_completion",
Usage: usage,
Usage: &usageForChunk,
}
xlog.Debug("Sending goroutine", "text", s)
@@ -127,6 +131,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
ended <- process(id, predInput, input, config, ml, responses, extraUsage)
}()
var latestUsage *schema.OpenAIUsage
LOOP:
for {
select {
@@ -135,6 +141,14 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
xlog.Debug("No choices in the response, skipping")
continue
}
// Capture running cumulative usage for the optional trailer
// emitted after the final stop chunk when include_usage=true.
if ev.Usage != nil {
latestUsage = ev.Usage
}
// OpenAI streaming spec: intermediate chunks must NOT
// carry a `usage` field. Strip the tracking copy now.
ev.Usage = nil
respData, err := json.Marshal(ev)
if err != nil {
xlog.Debug("Failed to marshal response", "error", err)
@@ -194,8 +208,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
Object: "text_completion",
}
respData, _ := json.Marshal(resp)
fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData)
// Trailing usage chunk per OpenAI spec: emit only when the caller
// opted in via stream_options.include_usage.
if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && latestUsage != nil {
trailer := streamUsageTrailerJSON(id, input.Model, created, *latestUsage)
_, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer)
}
fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n")
c.Response().Flush()
return nil
@@ -247,7 +268,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "text_completion",
Usage: usage,
Usage: &usage,
}
jsonResult, _ := json.Marshal(resp)

View File

@@ -92,7 +92,7 @@ func EditEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "edit",
Usage: usage,
Usage: &usage,
}
jsonResult, _ := json.Marshal(resp)

View File

@@ -233,7 +233,7 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
ID: id,
Created: created,
Data: result,
Usage: schema.OpenAIUsage{
Usage: &schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,

View File

@@ -258,7 +258,7 @@ func InpaintingEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
Data: []schema.Item{{
URL: imgPath,
}},
Usage: schema.OpenAIUsage{
Usage: &schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,

View File

@@ -255,7 +255,10 @@ export function useChat(initialModel = '') {
)
messages.push(...historyForApi, { role: 'user', content: messageContent })
const requestBody = { model, messages, stream: true }
// include_usage tells LocalAI to emit a trailing chunk with token totals;
// without it the spec-compliant server drops `usage` from the stream and
// the token-count badge would never populate.
const requestBody = { model, messages, stream: true, stream_options: { include_usage: true } }
if (temperature !== null && temperature !== undefined) requestBody.temperature = temperature
if (topP !== null && topP !== undefined) requestBody.top_p = topP
if (topK !== null && topK !== undefined) requestBody.top_k = topK

View File

@@ -1212,6 +1212,9 @@ async function promptGPT(systemPrompt, input) {
// Add stream parameter for both regular chat and MCP (MCP now supports SSE streaming)
requestBody.stream = true;
// include_usage tells LocalAI to emit a trailing chunk with token totals;
// the spec-compliant server otherwise drops `usage` from the stream.
requestBody.stream_options = { include_usage: true };
// Add generation parameters if they are set (null means use default)
if (activeChat.temperature !== null && activeChat.temperature !== undefined) {

View File

@@ -82,7 +82,21 @@ type OpenAIResponse struct {
Choices []Choice `json:"choices,omitempty"`
Data []Item `json:"data,omitempty"`
Usage OpenAIUsage `json:"usage"`
// Usage is intentionally a pointer with omitempty: per the OpenAI
// chat-completion streaming spec, intermediate chunks must not carry
// a `usage` field. Marshalling a value-typed usage would emit
// `"usage":{"prompt_tokens":0,...}` on every chunk and break
// OpenAI-SDK consumers that filter on a truthy `result.usage`
// (continuedev/continue, Kilo Code, Roo Code, etc.).
Usage *OpenAIUsage `json:"usage,omitempty"`
}
// StreamOptions mirrors OpenAI's `stream_options` request field. The only
// member currently honored is IncludeUsage; when true, the streaming
// chat-completion response emits a trailing chunk with `choices:[]` and a
// populated `usage` object.
type StreamOptions struct {
IncludeUsage bool `json:"include_usage,omitempty" yaml:"include_usage,omitempty"`
}
type Choice struct {
@@ -198,6 +212,9 @@ type OpenAIRequest struct {
Stream bool `json:"stream"`
// StreamOptions opts into OpenAI streaming extensions, e.g. include_usage.
StreamOptions *StreamOptions `json:"stream_options,omitempty" yaml:"stream_options,omitempty"`
// Image (not supported by OpenAI)
Quality string `json:"quality"`
Step int `json:"step"`