Files
LocalAI/core/http/endpoints/openai/edit.go
LocalAI [bot] 8af963bdd9 fix(streaming): comply with OpenAI usage / stream_options spec (#9815)
* fix(streaming): comply with OpenAI usage / stream_options spec (#8546)

LocalAI emitted `"usage":{"prompt_tokens":0,...}` on every streamed
chunk because `OpenAIResponse.Usage` was a value type without
`omitempty`. The official OpenAI Node SDK and its consumers
(continuedev/continue, Kilo Code, Roo Code, Zed, IntelliJ Continue)
filter on a truthy `result.usage` to detect the trailing usage chunk;
LocalAI's zero-but-non-null usage on every intermediate chunk made
that filter swallow every content chunk and surface an empty chat
response while the server log looked successful.

Changes:

- `core/schema/openai.go`: `Usage *OpenAIUsage \`json:"usage,omitempty"\``
  so intermediate chunks no longer carry a `usage` key. Add
  `OpenAIRequest.StreamOptions` with `include_usage` to mirror OpenAI's
  request field.
- `core/http/endpoints/openai/chat.go` and `completion.go`: keep using
  the `Usage` struct field as an in-process channel for the running
  cumulative, but strip it before JSON marshalling. When the request
  set `stream_options.include_usage: true`, emit a dedicated trailing
  chunk with `"choices": []` and the populated usage (matching the
  OpenAI spec and llama.cpp's server behavior).
- `chat_emit.go`: new `streamUsageTrailerJSON` helper (an illustrative
  sketch follows this list); drop the `usage` parameter from
  `buildNoActionFinalChunks` since chunks no longer carry usage.
- Update `image.go`, `inpainting.go`, `edit.go` to wrap their Usage
  values with `&` for the new pointer field.
- UI: send `stream_options:{include_usage:true}` from the React
  (`useChat.js`) and legacy (`static/chat.js`) chat clients so the
  token-count badge keeps populating now that the server is
  spec-compliant.
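
An illustrative rendering of the trailer emission (the real helper is `streamUsageTrailerJSON` in `chat_emit.go`; the name, signature, and SSE plumbing below are assumptions, not the actual implementation):

```go
package openai

import (
	"encoding/json"
	"fmt"
	"io"

	"github.com/mudler/LocalAI/core/schema"
)

// writeUsageTrailer emits the spec-mandated final chunk when the client
// requested stream_options.include_usage: empty choices, populated usage.
func writeUsageTrailer(w io.Writer, id string, created int, model string, usage *schema.OpenAIUsage) error {
	trailer := schema.OpenAIResponse{
		ID:      id,
		Created: created,
		Model:   model,
		Object:  "chat.completion.chunk",
		Choices: []schema.Choice{}, // trailer carries no choices, only usage
		Usage:   usage,             // the running cumulative gathered while streaming
	}
	payload, err := json.Marshal(trailer)
	if err != nil {
		return err
	}
	_, err = fmt.Fprintf(w, "data: %s\n\n", payload)
	return err
}
```

In the streaming handlers this runs once, after the last content chunk and (presumably) before the final `data: [DONE]` marker, and only when the request set `stream_options.include_usage: true`.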

Tests:

- New `chat_stream_usage_test.go` pins the spec invariants:
  intermediate chunks have no `usage` key, the trailer JSON has
  `"choices":[]` and a populated `usage`, and `OpenAIRequest` parses
  `stream_options.include_usage` (a condensed sketch follows this
  list).
- Update `chat_emit_test.go` to reflect that finals no longer embed
  usage.
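
A condensed version of what such a test asserts (the real file is more thorough; names below are illustrative):

```go
package openai

import (
	"bytes"
	"encoding/json"
	"testing"

	"github.com/mudler/LocalAI/core/schema"
)

func TestIntermediateChunkOmitsUsage(t *testing.T) {
	chunk := schema.OpenAIResponse{Object: "chat.completion.chunk"} // Usage left nil
	b, err := json.Marshal(chunk)
	if err != nil {
		t.Fatal(err)
	}
	if bytes.Contains(b, []byte(`"usage"`)) {
		t.Fatalf("intermediate chunk must not carry a usage key: %s", b)
	}
}

func TestStreamOptionsIncludeUsageParses(t *testing.T) {
	var req schema.OpenAIRequest
	body := []byte(`{"model":"m","stream":true,"stream_options":{"include_usage":true}}`)
	if err := json.Unmarshal(body, &req); err != nil {
		t.Fatal(err)
	}
	if req.StreamOptions == nil || !req.StreamOptions.IncludeUsage {
		t.Fatalf("stream_options.include_usage not parsed: %+v", req.StreamOptions)
	}
}
```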

Verified against the live LocalAI instance: before the fix Continue's
filter logic swallowed 16/16 token chunks; with the new shape it
yields 4/5 and routes usage through the dedicated trailer chunk.

Fixes #8546

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(streaming): silence errcheck on usage trailer Fprintf

The new spec-compliant `stream_options.include_usage` trailer writes
were flagged by errcheck because they are new code (golangci-lint runs
with new-from-merge-base against master), while the surrounding
`fmt.Fprintf` `data:` writes are grandfathered. Drop the return values
explicitly to satisfy the linter without adding a nolint shim.
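
The shape of the change, for illustration (variable names are placeholders, not the exact diff):

```go
// Before: flagged by errcheck because the trailer write is new code.
// fmt.Fprintf(w, "data: %s\n\n", trailerJSON)

// After: return values discarded explicitly, no //nolint needed.
_, _ = fmt.Fprintf(w, "data: %s\n\n", trailerJSON)
```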

Assisted-by: Claude:opus-4.7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-14 08:53:46 +02:00

105 lines
3.2 KiB
Go

package openai

import (
	"encoding/json"
	"time"

	"github.com/google/uuid"
	"github.com/labstack/echo/v4"
	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/http/middleware"
	"github.com/mudler/LocalAI/core/schema"
	"github.com/mudler/LocalAI/core/templates"
	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/xlog"
)

// EditEndpoint is the OpenAI edit API endpoint
// @Summary OpenAI edit endpoint
// @Tags inference
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/edits [post]
func EditEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
	return func(c echo.Context) error {
		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
		if !ok || input.Model == "" {
			return echo.ErrBadRequest
		}

		// Opt-in extra usage flag
		extraUsage := c.Request().Header.Get("Extra-Usage") != ""

		config, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
		if !ok || config == nil {
			return echo.ErrBadRequest
		}

		xlog.Debug("Edit Endpoint Input", "input", input)
		xlog.Debug("Edit Endpoint Config", "config", *config)

		var result []schema.Choice
		totalTokenUsage := backend.TokenUsage{}

		for _, i := range config.InputStrings {
			templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.EditPromptTemplate, *config, templates.PromptTemplateData{
				Input:           i,
				Instruction:     input.Instruction,
				SystemPrompt:    config.SystemPrompt,
				ReasoningEffort: input.ReasoningEffort,
				Metadata:        input.Metadata,
			})
			if err == nil {
				i = templatedInput
				xlog.Debug("Template found, input modified", "input", i)
			}

			r, tokenUsage, _, err := ComputeChoices(input, i, config, cl, appConfig, ml, func(s string, c *[]schema.Choice) {
				*c = append(*c, schema.Choice{Text: s})
			}, nil)
			if err != nil {
				return err
			}

			totalTokenUsage.Prompt += tokenUsage.Prompt
			totalTokenUsage.Completion += tokenUsage.Completion
			totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
			totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing

			result = append(result, r...)
		}

		usage := schema.OpenAIUsage{
			PromptTokens:     totalTokenUsage.Prompt,
			CompletionTokens: totalTokenUsage.Completion,
			TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
		}
		if extraUsage {
			usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
			usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
		}

		id := uuid.New().String()
		created := int(time.Now().Unix())

		resp := &schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
			Choices: result,
			Object:  "edit",
			Usage:   &usage,
		}

		jsonResult, _ := json.Marshal(resp)
		xlog.Debug("Response", "response", string(jsonResult))

		// Return the prediction in the response body
		return c.JSON(200, resp)
	}
}
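
For reference, a minimal client-side sketch exercising this handler, assuming a local instance on port 8080 and a configured model named "my-model" (the request body fields follow the OpenAI edits shape; the `Extra-Usage` header opts into the timing fields populated above):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{"model":"my-model","instruction":"Fix the spelling","input":"helo wrold"}`)
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/v1/edits", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// Opt-in extra usage flag read by the handler above: any non-empty
	// value adds TimingTokenGeneration / TimingPromptProcessing to usage.
	req.Header.Set("Extra-Usage", "1")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // OpenAIResponse JSON with choices and usage
}
```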