diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 6ae2faf81..0951a88cc 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -131,13 +131,19 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator delta.Reasoning = &reasoningDelta } + // Usage rides as a struct field for the consumer to track the + // running cumulative — it is stripped before JSON marshal so the + // wire chunk stays spec-compliant (no `usage` on intermediate + // chunks). The dedicated trailer chunk (when include_usage=true) + // carries the final totals. + usageForChunk := usage resp := schema.OpenAIResponse{ ID: id, Created: created, Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}}, Object: "chat.completion.chunk", - Usage: usage, + Usage: &usageForChunk, } responses <- resp @@ -164,7 +170,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator hasChatDeltaToolCalls := false hasChatDeltaContent := false - _, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { + _, _, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { result += s // Track whether ChatDeltas from the C++ autoparser contain @@ -387,16 +393,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch { case noActionToRun: - usage := schema.OpenAIUsage{ - PromptTokens: tokenUsage.Prompt, - CompletionTokens: tokenUsage.Completion, - TotalTokens: tokenUsage.Prompt + tokenUsage.Completion, - } - if extraUsage { - usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration - usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing - } - + // Token-cumulative usage is communicated to the streaming + // consumer via the per-token callback's chunk struct (stripped + // before wire marshal). The final usage trailer — when the + // caller opted in with stream_options.include_usage — is built + // by the outer streaming loop, not here. var result string if !sentInitialRole { var hqErr error @@ -409,7 +410,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator for _, chunk := range buildNoActionFinalChunks( id, req.Model, created, sentInitialRole, sentReasoning, - result, reasoning, usage, + result, reasoning, ) { responses <- chunk } @@ -724,7 +725,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator xlog.Debug("No choices in the response, skipping") continue } - usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it + // Capture the running cumulative usage from this chunk + // (when present) so the include_usage trailer can carry + // the final totals. Usage is stripped before marshal + // below so the wire chunk stays spec-compliant. + if ev.Usage != nil { + usage = ev.Usage + } if len(ev.Choices[0].Delta.ToolCalls) > 0 { toolsCalled = true // Collect and merge tool call deltas for MCP execution @@ -740,6 +747,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator collectedContent += *sp } } + // OpenAI streaming spec: intermediate chunks must NOT + // carry a `usage` field. 
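// An illustrative, self-contained sketch of the track-then-strip flow the
// comments above describe. The types below (sketchUsage, sketchChunk) are
// simplified stand-ins assumed for illustration only, not the real schema
// package: the producer attaches the running cumulative to every in-memory
// chunk, the consumer records the latest totals and nils the field before
// the chunk is marshalled onto the wire.

package main

import (
    "encoding/json"
    "fmt"
)

type sketchUsage struct {
    PromptTokens     int `json:"prompt_tokens"`
    CompletionTokens int `json:"completion_tokens"`
    TotalTokens      int `json:"total_tokens"`
}

type sketchChunk struct {
    Object string       `json:"object"`
    Usage  *sketchUsage `json:"usage,omitempty"`
}

func main() {
    // Producer side: every in-memory chunk carries the running cumulative.
    stream := []sketchChunk{
        {Object: "chat.completion.chunk", Usage: &sketchUsage{PromptTokens: 9, CompletionTokens: 1, TotalTokens: 10}},
        {Object: "chat.completion.chunk", Usage: &sketchUsage{PromptTokens: 9, CompletionTokens: 2, TotalTokens: 11}},
    }

    var latest *sketchUsage
    for _, ch := range stream {
        if ch.Usage != nil {
            latest = ch.Usage // consumer side: remember the latest totals for the trailer
        }
        ch.Usage = nil // strip before marshalling: the wire chunk carries no usage
        wire, _ := json.Marshal(ch)
        fmt.Println(string(wire)) // {"object":"chat.completion.chunk"}
    }
    fmt.Printf("totals for the include_usage trailer: %+v\n", *latest)
}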
Strip the tracking copy + // before marshalling — usage is delivered via the + // dedicated trailer chunk when include_usage=true. + ev.Usage = nil respData, err := json.Marshal(ev) if err != nil { xlog.Debug("Failed to marshal response", "error", err) @@ -888,6 +900,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator finishReason = FinishReasonFunctionCall } + // Final delta chunk: empty delta with finish_reason set. Per + // OpenAI streaming spec this chunk does NOT carry usage — + // the optional trailer (below) does, gated on include_usage. resp := &schema.OpenAIResponse{ ID: id, Created: created, @@ -899,11 +914,18 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator Delta: &schema.Message{}, }}, Object: "chat.completion.chunk", - Usage: *usage, } respData, _ := json.Marshal(resp) - fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData) + + // Trailing usage chunk per OpenAI spec: emit only when the + // caller opted in via stream_options.include_usage. Shape: + // {"choices":[],"usage":{...},"object":"chat.completion.chunk",...} + if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && usage != nil { + trailer := streamUsageTrailerJSON(id, input.Model, created, *usage) + _, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer) + } + fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n") c.Response().Flush() xlog.Debug("Stream ended") @@ -1263,7 +1285,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. Choices: result, Object: "chat.completion", - Usage: usage, + Usage: &usage, } respData, _ := json.Marshal(resp) xlog.Debug("Response", "response", string(respData)) diff --git a/core/http/endpoints/openai/chat_emit.go b/core/http/endpoints/openai/chat_emit.go index 0418099a6..ba182e77d 100644 --- a/core/http/endpoints/openai/chat_emit.go +++ b/core/http/endpoints/openai/chat_emit.go @@ -1,12 +1,45 @@ package openai import ( + "encoding/json" "fmt" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" ) +// streamUsageTrailerJSON returns the bytes of the OpenAI-spec trailing usage +// chunk emitted in streaming completions when the request opts in via +// `stream_options.include_usage: true`. The shape is: +// +// {"id":"...","object":"chat.completion.chunk","created":N, +// "model":"...","choices":[],"usage":{...}} +// +// `choices` is intentionally an empty array (not absent, not null) — that is +// what the OpenAI spec mandates, and what consumers like the official OpenAI +// SDK and Continue's openai-adapter look for to recognise this as the usage +// chunk rather than a content chunk. schema.OpenAIResponse has `omitempty` +// on Choices, so we cannot reuse it for the trailer. +func streamUsageTrailerJSON(id, model string, created int, usage schema.OpenAIUsage) []byte { + trailer := struct { + ID string `json:"id"` + Created int `json:"created"` + Model string `json:"model"` + Object string `json:"object"` + Choices []schema.Choice `json:"choices"` + Usage schema.OpenAIUsage `json:"usage"` + }{ + ID: id, + Created: created, + Model: model, + Object: "chat.completion.chunk", + Choices: []schema.Choice{}, + Usage: usage, + } + b, _ := json.Marshal(trailer) + return b +} + // hasRealCall reports whether functionResults contains at least one // entry whose Name is something other than the noAction sentinel. 
// Used by processTools to decide between the "answer the question" @@ -25,10 +58,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b // pseudo-function or emitted no tool calls at all). // // When content was already streamed (contentAlreadyStreamed=true) the -// helper emits a single trailing usage chunk, optionally carrying -// reasoning that was produced but not streamed incrementally. When -// content was not streamed it emits a role chunk followed by a -// content+reasoning+usage chunk — the "send everything at once" fallback. +// helper emits a trailing reasoning chunk if any non-streamed reasoning +// remains, else nothing. When content was not streamed it emits a role +// chunk followed by a content (+reasoning) chunk — the "send everything +// at once" fallback. // // Reasoning re-emission is guarded by reasoningAlreadyStreamed, not by // probing the extractor's Go-side state: the C++ autoparser delivers @@ -36,6 +69,10 @@ func hasRealCall(functionResults []functions.FuncCallResults, noAction string) b // separate accumulator that extractor.Reasoning() does not expose. // Without this guard the callback would stream reasoning incrementally // and the final chunk would duplicate it. +// +// The returned chunks intentionally do NOT carry a `usage` field. The +// usage trailer is emitted separately by the streaming handler when +// `stream_options.include_usage` is true, per OpenAI spec. func buildNoActionFinalChunks( id, model string, created int, @@ -43,26 +80,26 @@ func buildNoActionFinalChunks( reasoningAlreadyStreamed bool, content string, reasoning string, - usage schema.OpenAIUsage, ) []schema.OpenAIResponse { var out []schema.OpenAIResponse if contentAlreadyStreamed { - delta := &schema.Message{} - if reasoning != "" && !reasoningAlreadyStreamed { - r := reasoning - delta.Reasoning = &r + if reasoning == "" || reasoningAlreadyStreamed { + return nil } + r := reasoning out = append(out, schema.OpenAIResponse{ ID: id, Created: created, Model: model, - Choices: []schema.Choice{{Delta: delta, Index: 0}}, - Object: "chat.completion.chunk", - Usage: usage, + Choices: []schema.Choice{{ + Delta: &schema.Message{Reasoning: &r}, + Index: 0, + }}, + Object: "chat.completion.chunk", }) return out } - // Content was not streamed — send role, then content (+reasoning) + usage. + // Content was not streamed — send role, then content (+reasoning). 
out = append(out, schema.OpenAIResponse{ ID: id, Created: created, Model: model, Choices: []schema.Choice{{ @@ -82,7 +119,6 @@ func buildNoActionFinalChunks( ID: id, Created: created, Model: model, Choices: []schema.Choice{{Delta: delta, Index: 0}}, Object: "chat.completion.chunk", - Usage: usage, }) return out } diff --git a/core/http/endpoints/openai/chat_emit_test.go b/core/http/endpoints/openai/chat_emit_test.go index 377d61c51..49432f0ac 100644 --- a/core/http/endpoints/openai/chat_emit_test.go +++ b/core/http/endpoints/openai/chat_emit_test.go @@ -609,54 +609,52 @@ var _ = Describe("buildNoActionFinalChunks", func() { testModel = "test-model" testCreated = 1700000000 ) - usage := schema.OpenAIUsage{PromptTokens: 5, CompletionTokens: 7, TotalTokens: 12} - Describe("Content streamed — trailing usage chunk", func() { - It("emits just one chunk with usage, no content, no reasoning when reasoning was streamed", func() { + Describe("Content streamed — trailing reasoning only", func() { + It("emits nothing when content and reasoning were already streamed", func() { + // Before the streaming-usage-spec fix this branch emitted a + // content-less chunk solely to carry `usage`. Per the OpenAI + // spec usage no longer rides on delta chunks; the dedicated + // trailer (when include_usage=true) carries it instead — so + // with nothing to deliver the helper returns no chunks. chunks := buildNoActionFinalChunks( testID, testModel, testCreated, true, true, - "", "already-streamed-reasoning", usage, + "", "already-streamed-reasoning", ) - - Expect(chunks).To(HaveLen(1)) - Expect(chunks[0].Usage.TotalTokens).To(Equal(12)) - Expect(contentOf(chunks[0])).To(BeEmpty()) - Expect(reasoningOf(chunks[0])).To(BeEmpty(), - "reasoning must not be re-emitted once it was streamed via the callback") + Expect(chunks).To(BeEmpty()) }) It("emits a trailing reasoning delivery when reasoning came only at end", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, true, false, - "", "autoparser final reasoning", usage, + "", "autoparser final reasoning", ) Expect(chunks).To(HaveLen(1)) Expect(reasoningOf(chunks[0])).To(Equal("autoparser final reasoning")) Expect(contentOf(chunks[0])).To(BeEmpty()) - Expect(chunks[0].Usage.TotalTokens).To(Equal(12)) + Expect(chunks[0].Usage).To(BeNil(), + "intermediate chunks must not carry usage per OpenAI spec") }) - It("omits reasoning when it's empty regardless of streamed flag", func() { + It("returns no chunks when reasoning is empty and content was streamed", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, true, false, - "", "", usage, + "", "", ) - - Expect(chunks).To(HaveLen(1)) - Expect(reasoningOf(chunks[0])).To(BeEmpty()) + Expect(chunks).To(BeEmpty()) }) }) - Describe("Content not streamed — role, then content+usage", func() { + Describe("Content not streamed — role, then content", func() { It("emits role chunk then content chunk without reasoning when reasoning was streamed", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, false, true, - "the answer", "already-streamed-reasoning", usage, + "the answer", "already-streamed-reasoning", ) Expect(chunks).To(HaveLen(2)) @@ -666,14 +664,14 @@ var _ = Describe("buildNoActionFinalChunks", func() { Expect(contentOf(chunks[1])).To(Equal("the answer")) Expect(reasoningOf(chunks[1])).To(BeEmpty(), "reasoning must not be re-emitted if it was streamed earlier") - Expect(chunks[1].Usage.TotalTokens).To(Equal(12)) + Expect(chunks[1].Usage).To(BeNil()) }) 
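// An illustrative sketch of what the two fallback chunks asserted above look
// like once marshalled. The types are simplified stand-ins assumed for
// illustration, not the real schema package: a role delta arrives first, then
// a content delta, and neither serialises a "usage" key.

package main

import (
    "encoding/json"
    "fmt"
)

type sketchDelta struct {
    Role    string `json:"role,omitempty"`
    Content string `json:"content,omitempty"`
}

type sketchChoice struct {
    Index int         `json:"index"`
    Delta sketchDelta `json:"delta"`
}

type sketchChunk struct {
    Object  string         `json:"object"`
    Choices []sketchChoice `json:"choices"`
}

func main() {
    chunks := []sketchChunk{
        {Object: "chat.completion.chunk", Choices: []sketchChoice{{Delta: sketchDelta{Role: "assistant"}}}},
        {Object: "chat.completion.chunk", Choices: []sketchChoice{{Delta: sketchDelta{Content: "the answer"}}}},
    }
    for _, ch := range chunks {
        wire, _ := json.Marshal(ch)
        fmt.Println(string(wire))
        // first:  {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"role":"assistant"}}]}
        // second: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"the answer"}}]}
    }
}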
It("emits role, then content+reasoning when reasoning was not streamed", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, false, false, - "the answer", "autoparser final reasoning", usage, + "the answer", "autoparser final reasoning", ) Expect(chunks).To(HaveLen(2)) @@ -681,14 +679,14 @@ var _ = Describe("buildNoActionFinalChunks", func() { Expect(contentOf(chunks[1])).To(Equal("the answer")) Expect(reasoningOf(chunks[1])).To(Equal("autoparser final reasoning")) - Expect(chunks[1].Usage.TotalTokens).To(Equal(12)) + Expect(chunks[1].Usage).To(BeNil()) }) It("still emits content even when reasoning is empty", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, false, false, - "just an answer", "", usage, + "just an answer", "", ) Expect(chunks).To(HaveLen(2)) @@ -702,7 +700,7 @@ var _ = Describe("buildNoActionFinalChunks", func() { chunks := buildNoActionFinalChunks( testID, testModel, testCreated, false, false, - "hi", "reasoning", usage, + "hi", "reasoning", ) for i, ch := range chunks { Expect(ch.ID).To(Equal(testID), "chunk[%d] ID", i) diff --git a/core/http/endpoints/openai/chat_stream_usage_test.go b/core/http/endpoints/openai/chat_stream_usage_test.go new file mode 100644 index 000000000..2bba7b91c --- /dev/null +++ b/core/http/endpoints/openai/chat_stream_usage_test.go @@ -0,0 +1,179 @@ +package openai + +import ( + "encoding/json" + + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/functions" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// These tests pin LocalAI's streaming chunks to the OpenAI spec for the +// `usage` field. The regression that motivated them (issue #8546) was that +// LocalAI emitted `"usage":{...zeros...}` on every chunk, which made the +// official OpenAI Node SDK consumers (Continue, Kilo Code, Roo Code, Zed, +// IntelliJ Continue) drop every content chunk via the filter at +// continuedev/continue packages/openai-adapters/src/apis/OpenAI.ts:275-288. +// +// Per OpenAI's chat-completion streaming contract: +// - intermediate chunks MUST NOT carry a `usage` field +// - usage is only delivered when the request opts in via +// `stream_options.include_usage: true`, on a final extra chunk whose +// `choices` is an empty array. + +var _ = Describe("streaming usage spec compliance", func() { + Describe("OpenAIResponse JSON shape", func() { + It("does not emit a 'usage' key when Usage is unset", func() { + // A typical intermediate token chunk: no Usage populated. 
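// The file header above describes the consumer-side failure this pins down.
// Re-imagined in Go purely for illustration (the real filter is TypeScript in
// the openai-adapters package; this is not its code), the problematic check
// amounts to treating any chunk with a truthy usage as the usage trailer:

package main

import (
    "encoding/json"
    "fmt"
)

type consumerChunk struct {
    Choices []struct {
        Delta struct {
            Content string `json:"content"`
        } `json:"delta"`
    } `json:"choices"`
    Usage *struct {
        TotalTokens int `json:"total_tokens"`
    } `json:"usage"`
}

func main() {
    // Before the fix, LocalAI attached a zeroed usage object to every chunk:
    raw := []byte(`{"choices":[{"delta":{"content":"hello"}}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}`)

    var ch consumerChunk
    _ = json.Unmarshal(raw, &ch)

    // A consumer that reads "has usage" as "usage trailer, no content here"
    // skips the delta, so every streamed token is silently dropped.
    if ch.Usage != nil {
        fmt.Println("chunk treated as usage-only; content discarded:", ch.Choices[0].Delta.Content)
        return
    }
    fmt.Println("content chunk:", ch.Choices[0].Delta.Content)
}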
+ content := "hello" + resp := schema.OpenAIResponse{ + ID: "req-1", + Created: 1, + Model: "m", + Object: "chat.completion.chunk", + Choices: []schema.Choice{{ + Index: 0, + Delta: &schema.Message{Content: &content}, + }}, + } + data, err := json.Marshal(resp) + Expect(err).ToNot(HaveOccurred()) + + var raw map[string]any + Expect(json.Unmarshal(data, &raw)).To(Succeed()) + _, present := raw["usage"] + Expect(present).To(BeFalse(), + "intermediate chunk must not include a 'usage' key; got: %s", string(data)) + }) + + It("emits the usage object when Usage is explicitly set", func() { + usage := &schema.OpenAIUsage{PromptTokens: 11, CompletionTokens: 22, TotalTokens: 33} + resp := schema.OpenAIResponse{ + ID: "req-1", + Created: 1, + Model: "m", + Object: "chat.completion.chunk", + Usage: usage, + } + data, err := json.Marshal(resp) + Expect(err).ToNot(HaveOccurred()) + + var raw map[string]any + Expect(json.Unmarshal(data, &raw)).To(Succeed()) + u, ok := raw["usage"].(map[string]any) + Expect(ok).To(BeTrue(), "expected 'usage' object, got: %s", string(data)) + Expect(u["prompt_tokens"]).To(BeNumerically("==", 11)) + Expect(u["completion_tokens"]).To(BeNumerically("==", 22)) + Expect(u["total_tokens"]).To(BeNumerically("==", 33)) + }) + }) + + Describe("buildNoActionFinalChunks", func() { + It("returns chunks with no Usage embedded", func() { + // Whatever the caller is doing, helpers must not bake usage + // into intermediate or final delta chunks. The usage trailer + // (when requested via include_usage) is emitted separately. + chunks := buildNoActionFinalChunks( + "req-1", "m", 1, + false, false, + "hi", "", + ) + Expect(chunks).ToNot(BeEmpty()) + for i, ch := range chunks { + Expect(ch.Usage).To(BeNil(), + "chunk[%d] must not carry Usage; got %+v", i, ch.Usage) + } + }) + + It("returns chunks with no Usage when only trailing reasoning needs delivery", func() { + chunks := buildNoActionFinalChunks( + "req-1", "m", 1, + true, false, + "", "autoparser late reasoning", + ) + Expect(chunks).ToNot(BeEmpty()) + for i, ch := range chunks { + Expect(ch.Usage).To(BeNil(), + "chunk[%d] must not carry Usage; got %+v", i, ch.Usage) + } + }) + }) + + Describe("buildDeferredToolCallChunks", func() { + It("returns chunks with no Usage embedded", func() { + calls := []functions.FuncCallResults{{ + Name: "do_thing", Arguments: `{"x":1}`, + }} + chunks := buildDeferredToolCallChunks( + "req-1", "m", 1, calls, 0, + false, "", false, "", + ) + Expect(chunks).ToNot(BeEmpty()) + for i, ch := range chunks { + Expect(ch.Usage).To(BeNil(), + "chunk[%d] must not carry Usage; got %+v", i, ch.Usage) + } + }) + }) + + Describe("streamUsageTrailerJSON", func() { + It("produces JSON matching the OpenAI spec for the trailer chunk", func() { + // Trailing usage chunk shape (OpenAI streaming spec): + // {"id":"...","object":"chat.completion.chunk","created":..., + // "model":"...","choices":[],"usage":{...}} + usage := schema.OpenAIUsage{ + PromptTokens: 18, CompletionTokens: 14, TotalTokens: 32, + } + data := streamUsageTrailerJSON("req-1", "m", 1, usage) + + var raw map[string]any + Expect(json.Unmarshal(data, &raw)).To(Succeed(), + "trailer must be valid JSON, got: %s", string(data)) + + Expect(raw["id"]).To(Equal("req-1")) + Expect(raw["model"]).To(Equal("m")) + Expect(raw["object"]).To(Equal("chat.completion.chunk")) + Expect(raw["created"]).To(BeNumerically("==", 1)) + + // `choices` MUST be present as an empty array (not absent, not null). 
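// Why the trailer needs its own struct instead of schema.OpenAIResponse: with
// `omitempty`, an empty slice disappears from the JSON, while the trailer must
// spell out `"choices":[]`. A minimal demonstration with throwaway types:

package main

import (
    "encoding/json"
    "fmt"
)

type withOmitempty struct {
    Choices []int `json:"choices,omitempty"`
}

type withoutOmitempty struct {
    Choices []int `json:"choices"`
}

func main() {
    a, _ := json.Marshal(withOmitempty{Choices: []int{}})
    b, _ := json.Marshal(withoutOmitempty{Choices: []int{}})
    c, _ := json.Marshal(withoutOmitempty{}) // nil slice, no omitempty

    fmt.Println(string(a)) // {}               key dropped, wrong shape for the trailer
    fmt.Println(string(b)) // {"choices":[]}   what the trailer must emit
    fmt.Println(string(c)) // {"choices":null} also wrong: null rather than an empty array
}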
+ rawChoices, present := raw["choices"] + Expect(present).To(BeTrue(), "choices key must be present, got: %s", string(data)) + choicesArr, ok := rawChoices.([]any) + Expect(ok).To(BeTrue(), "choices must serialize as an array, got: %s", string(data)) + Expect(choicesArr).To(BeEmpty(), "choices must be empty in usage trailer, got: %s", string(data)) + + // `usage` MUST be present and non-null with the populated counts. + u, ok := raw["usage"].(map[string]any) + Expect(ok).To(BeTrue(), "usage object must be present, got: %s", string(data)) + Expect(u["prompt_tokens"]).To(BeNumerically("==", 18)) + Expect(u["completion_tokens"]).To(BeNumerically("==", 14)) + Expect(u["total_tokens"]).To(BeNumerically("==", 32)) + }) + }) + + Describe("OpenAIRequest.StreamOptions", func() { + It("parses stream_options.include_usage=true", func() { + body := []byte(`{ + "model": "m", + "stream": true, + "stream_options": {"include_usage": true}, + "messages": [] + }`) + var req schema.OpenAIRequest + Expect(json.Unmarshal(body, &req)).To(Succeed()) + Expect(req.StreamOptions).ToNot(BeNil()) + Expect(req.StreamOptions.IncludeUsage).To(BeTrue()) + }) + + It("defaults IncludeUsage to false when stream_options is absent", func() { + body := []byte(`{"model":"m","stream":true,"messages":[]}`) + var req schema.OpenAIRequest + Expect(json.Unmarshal(body, &req)).To(Succeed()) + // Either a nil StreamOptions or one with IncludeUsage=false is acceptable. + if req.StreamOptions != nil { + Expect(req.StreamOptions.IncludeUsage).To(BeFalse()) + } + }) + }) +}) diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 069bc33a6..f81e13e6a 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -39,6 +39,10 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing } + // Usage rides on the struct for the consumer to track the + // running cumulative; the consumer strips it before marshalling + // so intermediate chunks stay OpenAI-spec compliant. + usageForChunk := usage resp := schema.OpenAIResponse{ ID: id, Created: created, @@ -51,7 +55,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva }, }, Object: "text_completion", - Usage: usage, + Usage: &usageForChunk, } xlog.Debug("Sending goroutine", "text", s) @@ -127,6 +131,8 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva ended <- process(id, predInput, input, config, ml, responses, extraUsage) }() + var latestUsage *schema.OpenAIUsage + LOOP: for { select { @@ -135,6 +141,14 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva xlog.Debug("No choices in the response, skipping") continue } + // Capture running cumulative usage for the optional trailer + // emitted after the final stop chunk when include_usage=true. + if ev.Usage != nil { + latestUsage = ev.Usage + } + // OpenAI streaming spec: intermediate chunks must NOT + // carry a `usage` field. Strip the tracking copy now. 
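// Once this loop drains, the intended tail of the stream per the spec looks
// like the sketch below: the final finish_reason chunk, the usage trailer only
// when the caller opted in, then the [DONE] sentinel. This is a standalone
// illustration with hard-coded payloads, not the handler code itself.

package main

import (
    "fmt"
    "os"
)

func main() {
    includeUsage := true // in the handler this comes from stream_options.include_usage
    w := os.Stdout

    finalChunk := `{"object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}`
    trailer := `{"object":"chat.completion.chunk","choices":[],"usage":{"prompt_tokens":18,"completion_tokens":14,"total_tokens":32}}`

    fmt.Fprintf(w, "data: %s\n\n", finalChunk)
    if includeUsage {
        fmt.Fprintf(w, "data: %s\n\n", trailer)
    }
    fmt.Fprintf(w, "data: [DONE]\n\n")
}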
+ ev.Usage = nil respData, err := json.Marshal(ev) if err != nil { xlog.Debug("Failed to marshal response", "error", err) @@ -194,8 +208,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva Object: "text_completion", } respData, _ := json.Marshal(resp) - fmt.Fprintf(c.Response().Writer, "data: %s\n\n", respData) + + // Trailing usage chunk per OpenAI spec: emit only when the caller + // opted in via stream_options.include_usage. + if input.StreamOptions != nil && input.StreamOptions.IncludeUsage && latestUsage != nil { + trailer := streamUsageTrailerJSON(id, input.Model, created, *latestUsage) + _, _ = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", trailer) + } + fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n") c.Response().Flush() return nil @@ -247,7 +268,7 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. Choices: result, Object: "text_completion", - Usage: usage, + Usage: &usage, } jsonResult, _ := json.Marshal(resp) diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go index c74e2d1f7..9a5198916 100644 --- a/core/http/endpoints/openai/edit.go +++ b/core/http/endpoints/openai/edit.go @@ -92,7 +92,7 @@ func EditEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator Model: input.Model, // we have to return what the user sent here, due to OpenAI spec. Choices: result, Object: "edit", - Usage: usage, + Usage: &usage, } jsonResult, _ := json.Marshal(resp) diff --git a/core/http/endpoints/openai/image.go b/core/http/endpoints/openai/image.go index 9d4fce246..47e53225f 100644 --- a/core/http/endpoints/openai/image.go +++ b/core/http/endpoints/openai/image.go @@ -233,7 +233,7 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi ID: id, Created: created, Data: result, - Usage: schema.OpenAIUsage{ + Usage: &schema.OpenAIUsage{ PromptTokens: 0, CompletionTokens: 0, TotalTokens: 0, diff --git a/core/http/endpoints/openai/inpainting.go b/core/http/endpoints/openai/inpainting.go index a27ffea54..5cf37d34e 100644 --- a/core/http/endpoints/openai/inpainting.go +++ b/core/http/endpoints/openai/inpainting.go @@ -258,7 +258,7 @@ func InpaintingEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app Data: []schema.Item{{ URL: imgPath, }}, - Usage: schema.OpenAIUsage{ + Usage: &schema.OpenAIUsage{ PromptTokens: 0, CompletionTokens: 0, TotalTokens: 0, diff --git a/core/http/react-ui/src/hooks/useChat.js b/core/http/react-ui/src/hooks/useChat.js index 09e004d08..30538ed12 100644 --- a/core/http/react-ui/src/hooks/useChat.js +++ b/core/http/react-ui/src/hooks/useChat.js @@ -255,7 +255,10 @@ export function useChat(initialModel = '') { ) messages.push(...historyForApi, { role: 'user', content: messageContent }) - const requestBody = { model, messages, stream: true } + // include_usage tells LocalAI to emit a trailing chunk with token totals; + // without it the spec-compliant server drops `usage` from the stream and + // the token-count badge would never populate. 
+ const requestBody = { model, messages, stream: true, stream_options: { include_usage: true } } if (temperature !== null && temperature !== undefined) requestBody.temperature = temperature if (topP !== null && topP !== undefined) requestBody.top_p = topP if (topK !== null && topK !== undefined) requestBody.top_k = topK diff --git a/core/http/static/chat.js b/core/http/static/chat.js index 34e9dc88c..59add0f09 100644 --- a/core/http/static/chat.js +++ b/core/http/static/chat.js @@ -1212,6 +1212,9 @@ async function promptGPT(systemPrompt, input) { // Add stream parameter for both regular chat and MCP (MCP now supports SSE streaming) requestBody.stream = true; + // include_usage tells LocalAI to emit a trailing chunk with token totals; + // the spec-compliant server otherwise drops `usage` from the stream. + requestBody.stream_options = { include_usage: true }; // Add generation parameters if they are set (null means use default) if (activeChat.temperature !== null && activeChat.temperature !== undefined) { diff --git a/core/schema/openai.go b/core/schema/openai.go index 6fcd78e93..83ab3a9fc 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -82,7 +82,21 @@ type OpenAIResponse struct { Choices []Choice `json:"choices,omitempty"` Data []Item `json:"data,omitempty"` - Usage OpenAIUsage `json:"usage"` + // Usage is intentionally a pointer with omitempty: per the OpenAI + // chat-completion streaming spec, intermediate chunks must not carry + // a `usage` field. Marshalling a value-typed usage would emit + // `"usage":{"prompt_tokens":0,...}` on every chunk and break + // OpenAI-SDK consumers that filter on a truthy `result.usage` + // (continuedev/continue, Kilo Code, Roo Code, etc.). + Usage *OpenAIUsage `json:"usage,omitempty"` +} + +// StreamOptions mirrors OpenAI's `stream_options` request field. The only +// member currently honored is IncludeUsage; when true, the streaming +// chat-completion response emits a trailing chunk with `choices:[]` and a +// populated `usage` object. +type StreamOptions struct { + IncludeUsage bool `json:"include_usage,omitempty" yaml:"include_usage,omitempty"` } type Choice struct { @@ -198,6 +212,9 @@ type OpenAIRequest struct { Stream bool `json:"stream"` + // StreamOptions opts into OpenAI streaming extensions, e.g. include_usage. + StreamOptions *StreamOptions `json:"stream_options,omitempty" yaml:"stream_options,omitempty"` + // Image (not supported by OpenAI) Quality string `json:"quality"` Step int `json:"step"`
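Taken together, the schema and endpoint changes give clients a spec-shaped opt-in. The sketch below is a standalone illustration with simplified stand-in types and a placeholder model name (not the real schema package, and no actual request to a running server): it shows the request body a client sends to receive the trailer, and how a consumer can tell the trailing usage chunk apart from ordinary delta chunks.

package main

import (
    "encoding/json"
    "fmt"
)

type streamOptions struct {
    IncludeUsage bool `json:"include_usage,omitempty"`
}

type chatRequest struct {
    Model         string           `json:"model"`
    Stream        bool             `json:"stream"`
    StreamOptions *streamOptions   `json:"stream_options,omitempty"`
    Messages      []map[string]any `json:"messages"`
}

type streamChunk struct {
    Object  string           `json:"object"`
    Choices []map[string]any `json:"choices"`
    Usage   map[string]int   `json:"usage"`
}

func main() {
    // Opting in: without stream_options.include_usage, a spec-compliant server
    // never sends usage anywhere in a streamed response.
    req := chatRequest{
        Model:         "local-model", // placeholder name
        Stream:        true,
        StreamOptions: &streamOptions{IncludeUsage: true},
        Messages:      []map[string]any{{"role": "user", "content": "hi"}},
    }
    body, _ := json.Marshal(req)
    fmt.Println(string(body))

    // Consumer side: the trailer is the chunk with an empty choices array and
    // a populated usage object; everything else is an ordinary delta chunk.
    for _, raw := range []string{
        `{"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"hi"}}]}`,
        `{"object":"chat.completion.chunk","choices":[],"usage":{"prompt_tokens":3,"completion_tokens":1,"total_tokens":4}}`,
    } {
        var ch streamChunk
        _ = json.Unmarshal([]byte(raw), &ch)
        if len(ch.Choices) == 0 && ch.Usage != nil {
            fmt.Println("usage trailer:", ch.Usage)
        } else {
            fmt.Println("delta chunk with", len(ch.Choices), "choice(s)")
        }
    }
}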