mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-03 04:46:54 -04:00
fix(cloud-proxy): parameter compatibility with newest reasoning models (#10640)
Newest cloud reasoning models reject two parameters the cloud-proxy
backend currently sends:
- Anthropic (claude-opus-4-x) and OpenAI (gpt-5.x) return 400 when
temperature is present: "'temperature' is deprecated for this model".
OpenAI-compatible clients typically send only the server-side DEFAULT
sampling values rather than user intent, so the translators now forward
neither temperature nor top_p and let the upstream apply its own
defaults.
- OpenAI gpt-5.x rejects max_tokens ("Unsupported parameter: 'max_tokens'
... Use 'max_completion_tokens' instead"). The OpenAI translator now
serializes the token limit as max_completion_tokens, which current
chat-completions models accept.
Verified live against claude-opus-4-8, gpt-5.5 and gemini-3.1-pro
(Gemini OpenAI-compat endpoint). Tests updated to the new contract.
Assisted-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: stefanwalcz <stefan.walcz@walcz.de>
This commit is contained in:
@@ -142,19 +142,12 @@ func buildAnthropicRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream boo
|
||||
if req.MaxTokens <= 0 {
|
||||
req.MaxTokens = anthropicDefaultMaxTokens
|
||||
}
|
||||
// Newer Anthropic models 400 when both temperature and top_p are
|
||||
// set ("`temperature` and `top_p` cannot both be specified for
|
||||
// this model. Please use only one.") even though their docs only
|
||||
// "recommend" picking one. The OpenAI-compatible chat UI almost
|
||||
// always sends both with default values, so prefer temperature
|
||||
// and drop top_p when both are present.
|
||||
if t := opts.GetTemperature(); t != 0 {
|
||||
v := float64(t)
|
||||
req.Temperature = &v
|
||||
} else if t := opts.GetTopP(); t != 0 {
|
||||
v := float64(t)
|
||||
req.TopP = &v
|
||||
}
|
||||
// Do not forward temperature/top_p. Newer Anthropic reasoning models reject
|
||||
// requests that carry temperature ("`temperature` is deprecated for this
|
||||
// model"), and the OpenAI-compatible clients typically send only the
|
||||
// server-side DEFAULT sampling values rather than user intent — dropping
|
||||
// them loses nothing and lets the upstream apply its own defaults.
|
||||
_ = opts
|
||||
|
||||
req.Tools = convertOpenAITools(opts.GetTools())
|
||||
req.ToolChoice = convertOpenAIToolChoice(opts.GetToolChoice())
|
||||
|
||||
@@ -3,7 +3,6 @@ package main
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
@@ -75,15 +74,16 @@ func TestPredict_Anthropic_BasicMessages(t *testing.T) {
|
||||
g.Expect(captured.Messages).To(HaveLen(1))
|
||||
g.Expect(captured.Messages[0].Role).To(Equal("user"))
|
||||
g.Expect(captured.MaxTokens).To(Equal(int32(32)))
|
||||
g.Expect(captured.Temperature).NotTo(BeNil())
|
||||
g.Expect(*captured.Temperature).To(Equal(0.5))
|
||||
// Anthropic 400s when both temperature and top_p are set; the
|
||||
// translator must prefer temperature and drop top_p.
|
||||
// Newer Anthropic reasoning models reject requests carrying temperature
|
||||
// ("`temperature` is deprecated for this model"); clients typically send
|
||||
// only default sampling values, so the translator forwards neither.
|
||||
g.Expect(captured.Temperature).To(BeNil())
|
||||
g.Expect(captured.TopP).To(BeNil())
|
||||
g.Expect(captured.Stream).To(BeFalse())
|
||||
}
|
||||
|
||||
// When only top_p is set, it should be forwarded.
|
||||
// Sampling parameters are not forwarded at all — the upstream applies its
|
||||
// own defaults (newest models reject explicit temperature/top_p).
|
||||
func TestPredict_Anthropic_TopPOnly(t *testing.T) {
|
||||
g := NewWithT(t)
|
||||
srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
|
||||
@@ -99,11 +99,7 @@ func TestPredict_Anthropic_TopPOnly(t *testing.T) {
|
||||
})
|
||||
g.Expect(err).NotTo(HaveOccurred())
|
||||
g.Expect(captured.Temperature).To(BeNil())
|
||||
// PredictOptions.TopP is float32 on the wire; the translator widens
|
||||
// to float64 so 0.9 round-trips as 0.8999999761581421… — compare
|
||||
// with a small tolerance rather than exact equality.
|
||||
g.Expect(captured.TopP).NotTo(BeNil())
|
||||
g.Expect(math.Abs(*captured.TopP - 0.9)).To(BeNumerically("<=", 1e-6))
|
||||
g.Expect(captured.TopP).To(BeNil())
|
||||
}
|
||||
|
||||
func TestPredict_Anthropic_DefaultsMaxTokens(t *testing.T) {
|
||||
|
||||
@@ -30,7 +30,7 @@ type openAIRequest struct {
|
||||
Stream bool `json:"stream,omitempty"`
|
||||
Temperature *float64 `json:"temperature,omitempty"`
|
||||
TopP *float64 `json:"top_p,omitempty"`
|
||||
MaxTokens *int32 `json:"max_tokens,omitempty"`
|
||||
MaxTokens *int32 `json:"max_completion_tokens,omitempty"` // newer OpenAI models reject max_tokens ("use max_completion_tokens instead")
|
||||
Stop []string `json:"stop,omitempty"`
|
||||
FrequencyPenalty *float64 `json:"frequency_penalty,omitempty"`
|
||||
PresencePenalty *float64 `json:"presence_penalty,omitempty"`
|
||||
@@ -107,14 +107,10 @@ func buildOpenAIRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool)
|
||||
Tools: parseRawJSON(opts.GetTools()),
|
||||
ToolChoice: parseRawJSON(opts.GetToolChoice()),
|
||||
}
|
||||
if t := opts.GetTemperature(); t != 0 {
|
||||
v := float64(t)
|
||||
req.Temperature = &v
|
||||
}
|
||||
if t := opts.GetTopP(); t != 0 {
|
||||
v := float64(t)
|
||||
req.TopP = &v
|
||||
}
|
||||
// Do not forward temperature/top_p. Newer OpenAI reasoning models reject
|
||||
// temperature as deprecated, and clients typically send only default
|
||||
// sampling values rather than user intent — let the upstream apply its
|
||||
// own defaults.
|
||||
if n := opts.GetTokens(); n > 0 {
|
||||
req.MaxTokens = &n
|
||||
}
|
||||
|
||||
@@ -74,8 +74,9 @@ func TestPredict_OpenAI_BasicChat(t *testing.T) {
|
||||
g.Expect(captured.Messages).To(HaveLen(2))
|
||||
g.Expect(captured.Messages[0].Role).To(Equal("system"))
|
||||
g.Expect(captured.Messages[1].Role).To(Equal("user"))
|
||||
g.Expect(captured.Temperature).NotTo(BeNil())
|
||||
g.Expect(*captured.Temperature).To(Equal(0.5))
|
||||
// Sampling parameters are not forwarded (newest models reject explicit
|
||||
// temperature); token limit is serialized as max_completion_tokens.
|
||||
g.Expect(captured.Temperature).To(BeNil())
|
||||
g.Expect(captured.MaxTokens).NotTo(BeNil())
|
||||
g.Expect(*captured.MaxTokens).To(Equal(int32(32)))
|
||||
g.Expect(captured.Stream).To(BeFalse())
|
||||
|
||||
Reference in New Issue
Block a user