From 580517f9dbbb04635b1cb7e41368154ff6c45dff Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 5 Mar 2026 22:50:10 +0100 Subject: [PATCH] feat: pass-by metadata to predict options (#8795) Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 1 + backend/cpp/llama-cpp/grpc-server.cpp | 14 +++++++++ core/backend/llm.go | 6 +++- core/backend/options.go | 11 +++++++ core/http/endpoints/anthropic/messages.go | 4 +-- core/http/endpoints/openai/inference.go | 2 +- core/http/endpoints/openai/realtime_model.go | 2 +- .../http/endpoints/openresponses/responses.go | 10 +++--- docs/content/advanced/model-configuration.md | 31 +++++++++++++++++++ docs/content/features/text-generation.md | 2 +- 10 files changed, 72 insertions(+), 11 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index be12dfab7..6312036b2 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -162,6 +162,7 @@ message PredictOptions { string ToolChoice = 49; // JSON string or object specifying tool choice behavior int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter) int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter) + map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking) } // The response message containing the result diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 7bdd123c2..a12d49a49 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1297,6 +1297,13 @@ public: body_json["min_p"] = data["min_p"]; } + // Pass metadata fields to body_json + const auto& metadata = request->metadata(); + auto et_it = metadata.find("enable_thinking"); + if (et_it != metadata.end()) { + body_json["enable_thinking"] = (et_it->second == "true"); + } + // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) 
SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); @@ -2064,6 +2071,13 @@ public: body_json["min_p"] = data["min_p"]; } + // Pass metadata fields to body_json + const auto& predict_metadata = request->metadata(); + auto predict_et_it = predict_metadata.find("enable_thinking"); + if (predict_et_it != predict_metadata.end()) { + body_json["enable_thinking"] = (predict_et_it->second == "true"); + } + // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); diff --git a/core/backend/llm.go b/core/backend/llm.go index 40b53e74c..d9bc4f02d 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -37,7 +37,7 @@ type TokenUsage struct { TimingTokenGeneration float64 } -func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64, metadata map[string]string) (func() (LLMResponse, error), error) { modelFile := c.Model // Check if the modelFile exists, if it doesn't try to load it from the gallery @@ -85,6 +85,10 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima // in GRPC, the backend is supposed to 
answer to 1 single token if stream is not supported fn := func() (LLMResponse, error) { opts := gRPCPredictOpts(*c, loader.ModelPath) + // Merge request-level metadata (overrides config defaults) + for k, v := range metadata { + opts.Metadata[k] = v + } opts.Prompt = s opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate diff --git a/core/backend/options.go b/core/backend/options.go index f3d5a4ccd..3268c9287 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -254,6 +254,17 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions TailFreeSamplingZ: float32(*c.TFZ), TypicalP: float32(*c.TypicalP), } + + metadata := map[string]string{} + if c.ReasoningConfig.DisableReasoning != nil { + if *c.ReasoningConfig.DisableReasoning { + metadata["enable_thinking"] = "false" + } else { + metadata["enable_thinking"] = "true" + } + } + pbOpts.Metadata = metadata + // Logprobs and TopLogprobs are set by the caller if provided return pbOpts } diff --git a/core/http/endpoints/anthropic/messages.go b/core/http/endpoints/anthropic/messages.go index f2acc524f..c0405499e 100644 --- a/core/http/endpoints/anthropic/messages.go +++ b/core/http/endpoints/anthropic/messages.go @@ -119,7 +119,7 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic } predFunc, err := backend.ModelInference( - input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil) + input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata) if err != nil { xlog.Error("Anthropic model inference failed", "error", err) return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err)) @@ -335,7 +335,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq } predFunc, err := 
backend.ModelInference( - input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil) + input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata) if err != nil { xlog.Error("Anthropic stream model inference failed", "error", err) return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err)) diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index 37b14c98b..0b99d9e13 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -82,7 +82,7 @@ func ComputeChoices( // get the model function to call for the result predFunc, err := backend.ModelInference( - req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias) + req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, req.Metadata) if err != nil { return result, backend.TokenUsage{}, err } diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index 2b757e566..224135b30 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -237,7 +237,7 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im toolChoiceJSON = string(b) } - return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias) + return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, 
toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, nil) } func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) { diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 2a939c730..540f29a51 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -788,7 +788,7 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon } predFunc, err := backend.ModelInference( - ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias) + ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil) if err != nil { return nil, fmt.Errorf("model inference failed: %w", err) } @@ -1010,7 +1010,7 @@ func handleBackgroundStream(ctx context.Context, store *ResponseStore, responseI } predFunc, err := backend.ModelInference( - ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias) + ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil) if err != nil { return nil, fmt.Errorf("model inference failed: %w", err) } @@ -1482,7 +1482,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i } predFunc, err := backend.ModelInference( - input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias) + input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, 
toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil) if err != nil { xlog.Error("Open Responses model inference failed", "error", err) return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "") @@ -2021,7 +2021,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 } predFunc, err := backend.ModelInference( - input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias) + input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil) if err != nil { xlog.Error("Open Responses stream model inference failed", "error", err) sendSSEEvent(c, &schema.ORStreamEvent{ @@ -2449,7 +2449,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6 } predFunc, err := backend.ModelInference( - input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias) + input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias, nil) if err != nil { xlog.Error("Open Responses stream model inference failed", "error", err) sendSSEEvent(c, &schema.ORStreamEvent{ diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 4dbf70d88..1fd49328f 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -474,6 +474,37 @@ reasoning: **Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. 
This allows you to override default behavior or add support for new reasoning tag formats. +### Per-Request Override via Metadata + +The `reasoning.disable` setting from model configuration can be overridden on a per-request basis using the `metadata` field in the OpenAI chat completion request. This allows you to enable or disable thinking for individual requests without changing the model configuration. + +The `metadata` field accepts a `map[string]string` that is forwarded to the backend. The `enable_thinking` key controls thinking behavior: + +```bash +# Enable thinking for a single request (overrides model config) +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen3", + "messages": [{"role": "user", "content": "Explain quantum computing"}], + "metadata": {"enable_thinking": "true"} + }' + +# Disable thinking for a single request (overrides model config) +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen3", + "messages": [{"role": "user", "content": "Hello"}], + "metadata": {"enable_thinking": "false"} + }' +``` + +**Priority order:** +1. Request-level `metadata.enable_thinking` (highest priority) +2. Model config `reasoning.disable` (fallback) +3. 
Auto-detected from model template (default) + ## Pipeline Configuration Define pipelines for audio-to-audio processing and the [Realtime API]({{%relref "features/openai-realtime" %}}): diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index b83d01b2c..00f2a3156 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -110,7 +110,7 @@ curl http://localhost:8080/v1/messages \ | `stream` | boolean | No | Enable streaming responses | | `tools` | array | No | Array of tool definitions for function calling | | `tool_choice` | string/object | No | Tool choice strategy: "auto", "any", "none", or specific tool | -| `metadata` | object | No | Custom metadata to attach to the request | +| `metadata` | object | No | Per-request metadata passed to the backend (e.g., `{"enable_thinking": "true"}`) | #### Message Format