mirror of
https://github.com/mudler/LocalAI.git
synced 2026-03-31 13:15:51 -04:00
feat: pass-by metadata to predict options (#8795)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
0cf7c18177
commit
580517f9db
@@ -162,6 +162,7 @@ message PredictOptions {
|
||||
string ToolChoice = 49; // JSON string or object specifying tool choice behavior
|
||||
int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter)
|
||||
int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
|
||||
map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking)
|
||||
}
|
||||
|
||||
// The response message containing the result
|
||||
|
||||
@@ -1297,6 +1297,13 @@ public:
|
||||
body_json["min_p"] = data["min_p"];
|
||||
}
|
||||
|
||||
// Pass metadata fields to body_json
|
||||
const auto& metadata = request->metadata();
|
||||
auto et_it = metadata.find("enable_thinking");
|
||||
if (et_it != metadata.end()) {
|
||||
body_json["enable_thinking"] = (et_it->second == "true");
|
||||
}
|
||||
|
||||
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
|
||||
SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
|
||||
|
||||
@@ -2064,6 +2071,13 @@ public:
|
||||
body_json["min_p"] = data["min_p"];
|
||||
}
|
||||
|
||||
// Pass metadata fields to body_json
|
||||
const auto& predict_metadata = request->metadata();
|
||||
auto predict_et_it = predict_metadata.find("enable_thinking");
|
||||
if (predict_et_it != predict_metadata.end()) {
|
||||
body_json["enable_thinking"] = (predict_et_it->second == "true");
|
||||
}
|
||||
|
||||
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
|
||||
SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ type TokenUsage struct {
|
||||
TimingTokenGeneration float64
|
||||
}
|
||||
|
||||
func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (LLMResponse, error), error) {
|
||||
func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64, metadata map[string]string) (func() (LLMResponse, error), error) {
|
||||
modelFile := c.Model
|
||||
|
||||
// Check if the modelFile exists, if it doesn't try to load it from the gallery
|
||||
@@ -85,6 +85,10 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
|
||||
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
|
||||
fn := func() (LLMResponse, error) {
|
||||
opts := gRPCPredictOpts(*c, loader.ModelPath)
|
||||
// Merge request-level metadata (overrides config defaults)
|
||||
for k, v := range metadata {
|
||||
opts.Metadata[k] = v
|
||||
}
|
||||
opts.Prompt = s
|
||||
opts.Messages = protoMessages
|
||||
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
||||
|
||||
@@ -254,6 +254,17 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
|
||||
TailFreeSamplingZ: float32(*c.TFZ),
|
||||
TypicalP: float32(*c.TypicalP),
|
||||
}
|
||||
|
||||
metadata := map[string]string{}
|
||||
if c.ReasoningConfig.DisableReasoning != nil {
|
||||
if *c.ReasoningConfig.DisableReasoning {
|
||||
metadata["enable_thinking"] = "false"
|
||||
} else {
|
||||
metadata["enable_thinking"] = "true"
|
||||
}
|
||||
}
|
||||
pbOpts.Metadata = metadata
|
||||
|
||||
// Logprobs and TopLogprobs are set by the caller if provided
|
||||
return pbOpts
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil)
|
||||
input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata)
|
||||
if err != nil {
|
||||
xlog.Error("Anthropic model inference failed", "error", err)
|
||||
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
|
||||
@@ -335,7 +335,7 @@ func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicReq
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil)
|
||||
input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, nil, nil, nil, input.Metadata)
|
||||
if err != nil {
|
||||
xlog.Error("Anthropic stream model inference failed", "error", err)
|
||||
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
|
||||
|
||||
@@ -82,7 +82,7 @@ func ComputeChoices(
|
||||
|
||||
// get the model function to call for the result
|
||||
predFunc, err := backend.ModelInference(
|
||||
req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
|
||||
req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, req.Metadata)
|
||||
if err != nil {
|
||||
return result, backend.TokenUsage{}, err
|
||||
}
|
||||
|
||||
@@ -237,7 +237,7 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
|
||||
toolChoiceJSON = string(b)
|
||||
}
|
||||
|
||||
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
|
||||
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, nil)
|
||||
}
|
||||
|
||||
func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
|
||||
|
||||
@@ -788,7 +788,7 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias)
|
||||
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("model inference failed: %w", err)
|
||||
}
|
||||
@@ -1010,7 +1010,7 @@ func handleBackgroundStream(ctx context.Context, store *ResponseStore, responseI
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias)
|
||||
ctx, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("model inference failed: %w", err)
|
||||
}
|
||||
@@ -1482,7 +1482,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias)
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, nil, toolsJSON, toolChoiceJSON, logprobs, input.TopLogprobs, input.LogitBias, nil)
|
||||
if err != nil {
|
||||
xlog.Error("Open Responses model inference failed", "error", err)
|
||||
return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
|
||||
@@ -2021,7 +2021,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias)
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, streamLogprobs, input.TopLogprobs, input.LogitBias, nil)
|
||||
if err != nil {
|
||||
xlog.Error("Open Responses stream model inference failed", "error", err)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
@@ -2449,7 +2449,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias)
|
||||
input.Context, predInput, openAIReq.Messages, images, videos, audios, ml, cfg, cl, appConfig, tokenCallback, toolsJSON, toolChoiceJSON, mcpLogprobs, input.TopLogprobs, input.LogitBias, nil)
|
||||
if err != nil {
|
||||
xlog.Error("Open Responses stream model inference failed", "error", err)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
|
||||
@@ -474,6 +474,37 @@ reasoning:
|
||||
|
||||
**Note:** Custom tokens and tag pairs are checked before the default ones, giving them priority. This allows you to override default behavior or add support for new reasoning tag formats.
|
||||
|
||||
### Per-Request Override via Metadata
|
||||
|
||||
The `reasoning.disable` setting from model configuration can be overridden on a per-request basis using the `metadata` field in the OpenAI chat completion request. This allows you to enable or disable thinking for individual requests without changing the model configuration.
|
||||
|
||||
The `metadata` field accepts a `map[string]string` that is forwarded to the backend. The `enable_thinking` key controls thinking behavior:
|
||||
|
||||
```bash
|
||||
# Enable thinking for a single request (overrides model config)
|
||||
curl http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen3",
|
||||
"messages": [{"role": "user", "content": "Explain quantum computing"}],
|
||||
"metadata": {"enable_thinking": "true"}
|
||||
}'
|
||||
|
||||
# Disable thinking for a single request (overrides model config)
|
||||
curl http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen3",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
"metadata": {"enable_thinking": "false"}
|
||||
}'
|
||||
```
|
||||
|
||||
**Priority order:**
|
||||
1. Request-level `metadata.enable_thinking` (highest priority)
|
||||
2. Model config `reasoning.disable` (fallback)
|
||||
3. Auto-detected from model template (default)
|
||||
|
||||
## Pipeline Configuration
|
||||
|
||||
Define pipelines for audio-to-audio processing and the [Realtime API]({{%relref "features/openai-realtime" %}}):
|
||||
|
||||
@@ -110,7 +110,7 @@ curl http://localhost:8080/v1/messages \
|
||||
| `stream` | boolean | No | Enable streaming responses |
|
||||
| `tools` | array | No | Array of tool definitions for function calling |
|
||||
| `tool_choice` | string/object | No | Tool choice strategy: "auto", "any", "none", or specific tool |
|
||||
| `metadata` | object | No | Custom metadata to attach to the request |
|
||||
| `metadata` | object | No | Per-request metadata passed to the backend (e.g., `{"enable_thinking": "true"}`) |
|
||||
|
||||
#### Message Format
|
||||
|
||||
|
||||
Reference in New Issue
Block a user