feat: pass-through metadata to predict options (#8795)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-03-05 22:50:10 +01:00
committed by GitHub
parent 0cf7c18177
commit 580517f9db
10 changed files with 72 additions and 11 deletions

View File

@@ -162,6 +162,7 @@ message PredictOptions {
string ToolChoice = 49; // JSON string or object specifying tool choice behavior
int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter)
int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking)
}
// The response message containing the result

View File

@@ -1297,6 +1297,13 @@ public:
body_json["min_p"] = data["min_p"];
}
// Pass metadata fields to body_json
const auto& metadata = request->metadata();
auto et_it = metadata.find("enable_thinking");
if (et_it != metadata.end()) {
body_json["enable_thinking"] = (et_it->second == "true");
}
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
@@ -2064,6 +2071,13 @@ public:
body_json["min_p"] = data["min_p"];
}
// Pass metadata fields to body_json
const auto& predict_metadata = request->metadata();
auto predict_et_it = predict_metadata.find("enable_thinking");
if (predict_et_it != predict_metadata.end()) {
body_json["enable_thinking"] = (predict_et_it->second == "true");
}
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());

View File

@@ -37,7 +37,7 @@ type TokenUsage struct {
TimingTokenGeneration float64
}
func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (LLMResponse, error), error) {
func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64, metadata map[string]string) (func() (LLMResponse, error), error) {
modelFile := c.Model
// Check if the modelFile exists, if it doesn't try to load it from the gallery
@@ -85,6 +85,10 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
fn := func() (LLMResponse, error) {
opts := gRPCPredictOpts(*c, loader.ModelPath)
// Merge request-level metadata (overrides config defaults)
for k, v := range metadata {
opts.Metadata[k] = v
}
opts.Prompt = s
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate

View File

@@ -254,6 +254,17 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
}
metadata := map[string]string{}
if c.ReasoningConfig.DisableReasoning != nil {
if *c.ReasoningConfig.DisableReasoning {
metadata["enable_thinking"] = "false"
} else {
metadata["enable_thinking"] = "true"
}
}
pbOpts.Metadata = metadata
// Logprobs and TopLogprobs are set by the caller if provided
return pbOpts
}

View File

@@ -119,7 +119,7 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
}
predFunc, err := backend.ModelInference(
input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil)
input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata)
if err != nil {
xlog.Error("Anthropic model inference failed", "error", err)
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
@@ -335,7 +335,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
}
predFunc, err := backend.ModelInference(
input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil)
input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata)
if err != nil {
xlog.Error("Anthropic stream model inference failed", "error", err)
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))

View File

@@ -82,7 +82,7 @@ func ComputeChoices(
// get the model function to call for the result
predFunc, err := backend.ModelInference(
req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, req.Metadata)
if err != nil {
return result, backend.TokenUsage{}, err
}

View File

@@ -237,7 +237,7 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
toolChoiceJSON = string(b)
}
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, nil)
}
func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {

View File

@@ -788,7 +788,7 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
}
predFunc, err := backend.ModelInference(
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias)
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil)
if err != nil {
return nil, fmt.Errorf("model inference failed: %w", err)
}
@@ -1010,7 +1010,7 @@ func handleBackgroundStream(ctx context.Context, store *ResponseStore, responseI
}
predFunc, err := backend.ModelInference(
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias)
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil)
if err != nil {
return nil, fmt.Errorf("model inference failed: %w", err)
}
@@ -1482,7 +1482,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
}
predFunc, err := backend.ModelInference(
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias)
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil)
if err != nil {
xlog.Error("Open Responses model inference failed", "error", err)
return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
@@ -2021,7 +2021,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
}
predFunc, err := backend.ModelInference(
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias)
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil)
if err != nil {
xlog.Error("Open Responses stream model inference failed", "error", err)
sendSSEEvent(c, &schema.ORStreamEvent{
@@ -2449,7 +2449,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
}
predFunc, err := backend.ModelInference(
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias)
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias, nil)
if err != nil {
xlog.Error("Open Responses stream model inference failed", "error", err)
sendSSEEvent(c, &schema.ORStreamEvent{

View File

@@ -474,6 +474,37 @@ reasoning:
**Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. This allows you to override default behavior or add support for new reasoning tag formats.
### Per-Request Override via Metadata
The `reasoning.disable` setting from model configuration can be overridden on a per-request basis using the `metadata` field in the OpenAI chat completion request. This allows you to enable or disable thinking for individual requests without changing the model configuration.
The `metadata` field accepts a JSON object whose keys and values are both strings; it is forwarded to the backend unchanged. The `enable_thinking` key controls thinking behavior:
```bash
# Enable thinking for a single request (overrides model config)
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3",
"messages": [{"role": "user", "content": "Explain quantum computing"}],
"metadata": {"enable_thinking": "true"}
}'
# Disable thinking for a single request (overrides model config)
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3",
"messages": [{"role": "user", "content": "Hello"}],
"metadata": {"enable_thinking": "false"}
}'
```
**Priority order:**
1. Request-level `metadata.enable_thinking` (highest priority)
2. Model config `reasoning.disable` (fallback)
3. Auto-detected from model template (default)
## Pipeline Configuration
Define pipelines for audio-to-audio processing and the [Realtime API]({{%relref "features/openai-realtime" %}}):

View File

@@ -110,7 +110,7 @@ curl http://localhost:8080/v1/messages \
| `stream` | boolean | No | Enable streaming responses |
| `tools` | array | No | Array of tool definitions for function calling |
| `tool_choice` | string/object | No | Tool choice strategy: "auto", "any", "none", or specific tool |
| `metadata` | object | No | Custom metadata to attach to the request |
| `metadata` | object | No | Per-request metadata passed to the backend (e.g., `{"enable_thinking": "true"}`) |
#### Message Format