diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 5b032ad4e..8502e9530 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1922,25 +1922,27 @@ public: body_json["min_p"] = data["min_p"]; } - // Pass enable_thinking via chat_template_kwargs (where oaicompat_chat_params_parse reads it) + // Forward the chat_template_kwargs the Go layer resolved (model config + // chat_template_kwargs + per-request metadata: enable_thinking, + // reasoning_effort, preserve_thinking, ...). One generic merge replaces + // the previous per-key handling - new template levers need no C++ change. + // oaicompat_chat_params_parse reads these from body_json. const auto& metadata = request->metadata(); - auto et_it = metadata.find("enable_thinking"); - if (et_it != metadata.end()) { - if (!body_json.contains("chat_template_kwargs")) { - body_json["chat_template_kwargs"] = json::object(); + auto ctk_it = metadata.find("chat_template_kwargs"); + if (ctk_it != metadata.end() && !ctk_it->second.empty()) { + try { + json ctk = json::parse(ctk_it->second); + if (ctk.is_object()) { + if (!body_json.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = json::object(); + } + for (auto& el : ctk.items()) { + body_json["chat_template_kwargs"][el.key()] = el.value(); + } + } + } catch (const std::exception & e) { + SRV_WRN("failed to parse chat_template_kwargs metadata: %s\n", e.what()); } - body_json["chat_template_kwargs"]["enable_thinking"] = (et_it->second == "true"); - } - - // Pass reasoning_effort via chat_template_kwargs too: the lever - // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct - // from enable_thinking which those templates ignore. - auto re_it = metadata.find("reasoning_effort"); - if (re_it != metadata.end() && !re_it->second.empty()) { - if (!body_json.contains("chat_template_kwargs")) { - body_json["chat_template_kwargs"] = json::object(); - } - body_json["chat_template_kwargs"]["reasoning_effort"] = re_it->second; } // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) @@ -2756,25 +2758,26 @@ public: body_json["min_p"] = data["min_p"]; } - // Pass enable_thinking via chat_template_kwargs (where oaicompat_chat_params_parse reads it) + // Forward the chat_template_kwargs the Go layer resolved (model config + // chat_template_kwargs + per-request metadata: enable_thinking, + // reasoning_effort, preserve_thinking, ...). One generic merge replaces + // the previous per-key handling - new template levers need no C++ change. const auto& predict_metadata = request->metadata(); - auto predict_et_it = predict_metadata.find("enable_thinking"); - if (predict_et_it != predict_metadata.end()) { - if (!body_json.contains("chat_template_kwargs")) { - body_json["chat_template_kwargs"] = json::object(); + auto predict_ctk_it = predict_metadata.find("chat_template_kwargs"); + if (predict_ctk_it != predict_metadata.end() && !predict_ctk_it->second.empty()) { + try { + json ctk = json::parse(predict_ctk_it->second); + if (ctk.is_object()) { + if (!body_json.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = json::object(); + } + for (auto& el : ctk.items()) { + body_json["chat_template_kwargs"][el.key()] = el.value(); + } + } + } catch (const std::exception & e) { + SRV_WRN("failed to parse chat_template_kwargs metadata: %s\n", e.what()); } - body_json["chat_template_kwargs"]["enable_thinking"] = (predict_et_it->second == "true"); - } - - // Pass reasoning_effort via chat_template_kwargs too: the lever - // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct - // from enable_thinking which those templates ignore. - auto predict_re_it = predict_metadata.find("reasoning_effort"); - if (predict_re_it != predict_metadata.end() && !predict_re_it->second.empty()) { - if (!body_json.contains("chat_template_kwargs")) { - body_json["chat_template_kwargs"] = json::object(); - } - body_json["chat_template_kwargs"]["reasoning_effort"] = predict_re_it->second; } // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) diff --git a/core/backend/options.go b/core/backend/options.go index de8b4c44e..efe6c649f 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -368,6 +368,25 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions if c.ReasoningEffort != "" { metadata["reasoning_effort"] = c.ReasoningEffort } + // Client request metadata overrides the server-derived reasoning levers and + // reaches every backend through these standalone string keys (Python backends + // read them directly). The reserved blob key is server-owned and skipped. + for k, v := range c.RequestMetadata { + if k == "chat_template_kwargs" { + continue + } + metadata[k] = v + } + // Build the generic chat_template_kwargs blob (model config map + coerced + // metadata) for llama.cpp and write it LAST so a client cannot clobber it. + if blob := c.ResolveChatTemplateKwargs(metadata); len(blob) > 0 { + b, err := json.Marshal(blob) + if err != nil { + xlog.Warn("failed to marshal chat_template_kwargs", "error", err) + } else { + metadata["chat_template_kwargs"] = string(b) + } + } pbOpts.Metadata = metadata // Logprobs and TopLogprobs are set by the caller if provided diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index aa07b43bd..022d7b1d9 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -161,3 +161,67 @@ var _ = Describe("grpcModelOpts NBatch", func() { Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives") }) }) + +// Guards the generic chat_template_kwargs forwarding: the model config map plus any +// per-request metadata overrides are merged, coerced, and serialised into the +// backend metadata blob that llama.cpp reads. Client metadata also overrides the +// server-derived standalone enable_thinking key (cross-backend consistency). +var _ = Describe("gRPCPredictOpts chat_template_kwargs metadata", func() { + baseCfg := func() config.ModelConfig { + cfg := config.ModelConfig{} + cfg.SetDefaults() + return cfg + } + + It("serialises the config map into the chat_template_kwargs blob", func() { + cfg := baseCfg() + cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true} + opts := gRPCPredictOpts(cfg, "/tmp/models") + Expect(opts.Metadata).To(HaveKey("chat_template_kwargs")) + var blob map[string]any + Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed()) + Expect(blob).To(HaveKeyWithValue("preserve_thinking", true)) + }) + + It("serialises reasoning_effort into the blob as a JSON string", func() { + cfg := baseCfg() + cfg.ReasoningEffort = "high" + opts := gRPCPredictOpts(cfg, "/tmp/models") + Expect(opts.Metadata).To(HaveKey("chat_template_kwargs")) + var blob map[string]any + Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed()) + // reasoning_effort must remain a string in the blob (jinja templates that + // key on the level read a string), unlike enable_thinking which is a bool. + Expect(blob["reasoning_effort"]).To(BeAssignableToTypeOf("")) + Expect(blob).To(HaveKeyWithValue("reasoning_effort", "high")) + }) + + It("lets client request metadata override the server-derived enable_thinking key", func() { + cfg := baseCfg() + disable := true + cfg.ReasoningConfig = reasoning.Config{DisableReasoning: &disable} // server: enable_thinking=false + cfg.RequestMetadata = map[string]string{"enable_thinking": "true"} // client overrides + opts := gRPCPredictOpts(cfg, "/tmp/models") + // standalone key (Python backends) reflects the client override + Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true")) + // blob (llama.cpp) reflects it too, as a real bool + var blob map[string]any + Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed()) + Expect(blob).To(HaveKeyWithValue("enable_thinking", true)) + }) + + It("does not let a client clobber the blob via a chat_template_kwargs metadata key", func() { + cfg := baseCfg() + cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true} + cfg.RequestMetadata = map[string]string{"chat_template_kwargs": "{\"preserve_thinking\": false}"} + opts := gRPCPredictOpts(cfg, "/tmp/models") + var blob map[string]any + Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed()) + Expect(blob).To(HaveKeyWithValue("preserve_thinking", true)) + }) + + It("omits the blob when there is nothing to forward", func() { + opts := gRPCPredictOpts(baseCfg(), "/tmp/models") + Expect(opts.Metadata).ToNot(HaveKey("chat_template_kwargs")) + }) +}) diff --git a/core/config/chat_template_kwargs_test.go b/core/config/chat_template_kwargs_test.go new file mode 100644 index 000000000..4b32e695e --- /dev/null +++ b/core/config/chat_template_kwargs_test.go @@ -0,0 +1,48 @@ +package config_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" +) + +// ResolveChatTemplateKwargs layers the model config map (base) under the coerced +// backend metadata (server reasoning levers + client request overrides). +var _ = Describe("ModelConfig.ResolveChatTemplateKwargs", func() { + It("returns nil when nothing is set", func() { + c := &config.ModelConfig{} + Expect(c.ResolveChatTemplateKwargs(nil)).To(BeNil()) + }) + + It("returns the config map when no metadata is present", func() { + c := &config.ModelConfig{ChatTemplateKwargs: map[string]any{"preserve_thinking": true}} + Expect(c.ResolveChatTemplateKwargs(nil)).To(HaveKeyWithValue("preserve_thinking", true)) + }) + + It("lets metadata override the config map", func() { + c := &config.ModelConfig{ChatTemplateKwargs: map[string]any{"enable_thinking": true}} + got := c.ResolveChatTemplateKwargs(map[string]string{"enable_thinking": "false"}) + Expect(got).To(HaveKeyWithValue("enable_thinking", false)) + }) + + It("coerces true/false to bool and leaves other strings as-is", func() { + c := &config.ModelConfig{} + got := c.ResolveChatTemplateKwargs(map[string]string{ + "enable_thinking": "true", + "reasoning_effort": "high", + }) + Expect(got).To(HaveKeyWithValue("enable_thinking", true)) + Expect(got).To(HaveKeyWithValue("reasoning_effort", "high")) + }) + + It("skips the reserved chat_template_kwargs metadata key but keeps siblings", func() { + c := &config.ModelConfig{} + got := c.ResolveChatTemplateKwargs(map[string]string{ + "chat_template_kwargs": "{\"x\":1}", + "preserve_thinking": "true", + }) + Expect(got).ToNot(HaveKey("chat_template_kwargs")) + Expect(got).To(HaveKeyWithValue("preserve_thinking", true)) + }) +}) diff --git a/core/config/meta/registry_coverage_test.go b/core/config/meta/registry_coverage_test.go index a2cde3cf1..df2764d1f 100644 --- a/core/config/meta/registry_coverage_test.go +++ b/core/config/meta/registry_coverage_test.go @@ -112,6 +112,7 @@ var grandfatheredUnregistered = []string{ "agent.max_attempts", "agent.max_iterations", "cfg_scale", + "chat_template_kwargs", "concurrency_groups", "cutstrings", "debug", diff --git a/core/config/model_config.go b/core/config/model_config.go index 755280cc3..955a7a6ec 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -70,6 +70,19 @@ type ModelConfig struct { // (Harmony) or LFM2.5 — honor it; "none" also toggles enable_thinking off. ReasoningEffort string `yaml:"reasoning_effort,omitempty" json:"reasoning_effort,omitempty"` + // ChatTemplateKwargs are arbitrary key/values forwarded to the backend's jinja + // chat template via chat_template_kwargs (e.g. preserve_thinking: true). The + // server-derived reasoning levers (enable_thinking / reasoning_effort) and any + // per-request metadata overrides layer on top. See gRPCPredictOpts. + ChatTemplateKwargs map[string]any `yaml:"chat_template_kwargs,omitempty" json:"chat_template_kwargs,omitempty"` + + // RequestMetadata holds the raw client request `metadata` map for the current + // request. The request middleware stamps it; gRPCPredictOpts merges it into the + // backend gRPC metadata (overriding the server-derived enable_thinking / + // reasoning_effort) and folds it, coerced, into the chat_template_kwargs blob. + // Never persisted to YAML. + RequestMetadata map[string]string `yaml:"-" json:"-"` + FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early. // LLM configs (GPT4ALL, Llama.cpp, ...) LLMConfig `yaml:",inline" json:",inline"` @@ -551,6 +564,44 @@ func (c *ModelConfig) ApplyReasoningEffort(requestEffort string) { } } +// coerceChatTemplateKwarg coerces a request-metadata string value for use as a +// jinja chat_template_kwarg. "true"/"false" become real booleans (so a jinja +// `{% if preserve_thinking %}` reads false correctly, since any non-empty string +// is truthy); everything else stays a string. Numeric/typed per-request values are +// out of scope - set those in the model YAML chat_template_kwargs (YAML keeps the type). +func coerceChatTemplateKwarg(v string) any { + switch v { + case "true": + return true + case "false": + return false + default: + return v + } +} + +// ResolveChatTemplateKwargs builds the final chat_template_kwargs map forwarded to +// the backend, layered: the model config map (base) < the coerced backend metadata +// (server reasoning levers + client request overrides). `meta` is the already-merged +// backend metadata string map. The reserved "chat_template_kwargs" key is skipped so +// a client cannot smuggle a nested blob. Returns nil when there is nothing to forward. +func (c *ModelConfig) ResolveChatTemplateKwargs(meta map[string]string) map[string]any { + out := map[string]any{} + for k, v := range c.ChatTemplateKwargs { + out[k] = v + } + for k, v := range meta { + if k == "chat_template_kwargs" { + continue + } + out[k] = coerceChatTemplateKwarg(v) + } + if len(out) == 0 { + return nil + } + return out +} + // @Description PipelineStreaming toggles incremental delivery per realtime stage. type PipelineStreaming struct { LLM *bool `yaml:"llm,omitempty" json:"llm,omitempty"` diff --git a/core/http/app_test.go b/core/http/app_test.go index 735edaf1c..5917b034a 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -735,6 +735,18 @@ parameters: ` Expect(os.WriteFile(filepath.Join(modelDir, "mock-model.yaml"), []byte(mockModelYAML), 0644)).To(Succeed()) + // A second model carrying chat_template_kwargs so the REST->gRPC + // metadata-forwarding spec below can assert the model-YAML kwarg is + // merged with the per-request override. + mockCTKModelYAML := `name: mock-ctk-model +backend: mock-backend +parameters: + model: mock-model.bin +chat_template_kwargs: + preserve_thinking: true +` + Expect(os.WriteFile(filepath.Join(modelDir, "mock-ctk-model.yaml"), []byte(mockCTKModelYAML), 0644)).To(Succeed()) + systemState, err := system.GetSystemState( system.WithBackendPath(backendDir), system.WithModelPath(modelDir), @@ -809,6 +821,59 @@ parameters: Expect(string(dat)).To(ContainSubstring("mock-backend")) }) + It("forwards chat_template_kwargs and reasoning levers to gRPC PredictOptions.Metadata", func() { + // True HTTP->gRPC contract guard: drive a real /v1/chat/completions + // request and assert the exact metadata the REST layer forwarded to + // the backend. The mock-backend echoes PredictOptions.Metadata as JSON + // when it sees the ECHO_PREDICT_METADATA marker in the prompt, so this + // pins the request->gRPC mapping (model-YAML chat_template_kwargs + + // per-request metadata override + type coercion + standalone keys) + // without adding a new RPC. The marker rides in the user content and + // must survive into the backend prompt; if a future default chat + // template drops raw user content, move the marker to /v1/completions. + reqBody := map[string]any{ + "model": "mock-ctk-model", + "messages": []map[string]any{ + {"role": "user", "content": "ECHO_PREDICT_METADATA"}, + }, + // per-request override: overrides the standalone enable_thinking key + // and exercises coercion ("false" -> bool, "low" -> string) in the blob + "metadata": map[string]string{ + "enable_thinking": "false", + "reasoning_effort": "low", + }, + } + + var chatResp struct { + Choices []struct { + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` + } + err := postRequestResponseJSON("http://127.0.0.1:9090/v1/chat/completions", &reqBody, &chatResp) + Expect(err).ToNot(HaveOccurred()) + Expect(chatResp.Choices).ToNot(BeEmpty()) + + // The assistant content is the JSON snapshot of PredictOptions.Metadata. + var meta map[string]string + Expect(json.Unmarshal([]byte(chatResp.Choices[0].Message.Content), &meta)).To(Succeed(), "echoed metadata: %s", chatResp.Choices[0].Message.Content) + + // Standalone keys reflect the per-request override (consumed by Python + // backends; consistent across backends). + Expect(meta).To(HaveKeyWithValue("enable_thinking", "false")) + Expect(meta).To(HaveKeyWithValue("reasoning_effort", "low")) + + // The chat_template_kwargs blob (consumed by llama.cpp) merges the + // model-YAML kwarg with the coerced request metadata override. + Expect(meta).To(HaveKey("chat_template_kwargs")) + var ctk map[string]any + Expect(json.Unmarshal([]byte(meta["chat_template_kwargs"]), &ctk)).To(Succeed(), "chat_template_kwargs blob: %s", meta["chat_template_kwargs"]) + Expect(ctk).To(HaveKeyWithValue("preserve_thinking", true)) // bool from model YAML + Expect(ctk).To(HaveKeyWithValue("enable_thinking", false)) // coerced "false" -> bool + Expect(ctk).To(HaveKeyWithValue("reasoning_effort", "low")) // non-bool stays string + }) + // Agent Jobs: HTTP API for task/job scheduling. The underlying AgentPool // service is exercised in core/services/agentpool/agent_jobs_test.go; // these specs cover the /api/agent/* HTTP plumbing on top. diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 591ec8b93..ff0d929ac 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -318,6 +318,13 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema. // (an operator's explicit disable wins over a request asking to think). config.ApplyReasoningEffort(input.ReasoningEffort) + // Forward the client's request metadata so chat-template kwargs set per-request + // (enable_thinking, reasoning_effort, preserve_thinking, ...) reach the backend + // and override the model's reasoning-config defaults. See gRPCPredictOpts. + if len(input.Metadata) > 0 { + config.RequestMetadata = input.Metadata + } + // Collapse the modern max_completion_tokens alias into the // legacy Maxtokens field so downstream code reads exactly one. // MaxCompletionTokens wins on conflict — it's the canonical diff --git a/core/http/middleware/request_test.go b/core/http/middleware/request_test.go index 04c30dee4..fe9fc926c 100644 --- a/core/http/middleware/request_test.go +++ b/core/http/middleware/request_test.go @@ -731,3 +731,60 @@ var _ = Describe("SetModelAndConfig reasoning_effort parsing (chat completions)" Expect(*(*captured2).ReasoningConfig.DisableReasoning).To(BeFalse()) }) }) + +var _ = Describe("SetModelAndConfig metadata passthrough (chat completions)", func() { + var modelDir string + + BeforeEach(func() { + var err error + modelDir, err = os.MkdirTemp("", "localai-test-models-*") + Expect(err).ToNot(HaveOccurred()) + }) + AfterEach(func() { _ = os.RemoveAll(modelDir) }) + + buildApp := func() (*echo.Echo, **config.ModelConfig) { + Expect(os.WriteFile(filepath.Join(modelDir, "test-model.yaml"), + []byte("name: test-model\nbackend: llama\n"), 0644)).To(Succeed()) + ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}} + appConfig := config.NewApplicationConfig() + appConfig.SystemState = ss + mcl := config.NewModelConfigLoader(modelDir) + ml := model.NewModelLoader(ss) + re := NewRequestExtractor(mcl, ml, appConfig) + + captured := new(*config.ModelConfig) + app := echo.New() + app.POST("/v1/chat/completions", + func(c echo.Context) error { + if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok { + *captured = cfg + } + return c.String(http.StatusOK, "ok") + }, + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + func(next echo.HandlerFunc) echo.HandlerFunc { + return func(c echo.Context) error { + if err := re.SetOpenAIRequest(c); err != nil { + return err + } + return next(c) + } + }, + ) + return app, captured + } + + It("stamps request metadata onto the config", func() { + app, captured := buildApp() + body := `{"model":"test-model","messages":[{"role":"user","content":"hi"}],` + + `"metadata":{"preserve_thinking":"true"}}` + req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + app.ServeHTTP(rec, req) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).RequestMetadata).To(HaveKeyWithValue("preserve_thinking", "true")) + }) +}) diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index ec134d046..3d3b0e574 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -438,6 +438,36 @@ pipeline: reasoning_effort: none # overrides the LLM model's own reasoning_effort ``` +#### Custom `chat_template_kwargs` + +Some jinja chat templates expose extra variables beyond `enable_thinking` / +`reasoning_effort` (for example Qwen3's `preserve_thinking`). Set arbitrary key/values in +the model config and they are forwarded to the backend's `chat_template_kwargs` as-is, so +you don't need a dedicated server option per template variable: + +```yaml +name: qwen3 +chat_template_kwargs: + preserve_thinking: true +``` + +You can also override (or add) any of these per request through the OpenAI `metadata` +field on `/v1/chat/completions`. Values are strings; `"true"` / `"false"` are coerced to +booleans, anything else is passed through as a string: + +```json +{ + "model": "qwen3", + "messages": [{"role": "user", "content": "hi"}], + "metadata": { "preserve_thinking": "true", "enable_thinking": "false" } +} +``` + +Per-request `metadata` overrides the model config defaults and the reasoning-config levers, +and (for `enable_thinking` / `reasoning_effort`) takes effect across every backend that +reads them, not just llama.cpp. Typed (non-boolean) values are only supported through the +model YAML `chat_template_kwargs`, where YAML preserves the type. + ### Multimodal Backend Options | Option | Type | Default | Description | diff --git a/tests/e2e/mock-backend/main.go b/tests/e2e/mock-backend/main.go index 1a8e0418f..10fb25296 100644 --- a/tests/e2e/mock-backend/main.go +++ b/tests/e2e/mock-backend/main.go @@ -109,6 +109,23 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R }, nil } + // ECHO_PREDICT_METADATA lets tests assert exactly what the REST layer + // forwarded to the backend as gRPC PredictOptions.Metadata (e.g. the + // chat_template_kwargs blob and the standalone enable_thinking/reasoning_effort + // keys). The reply carries a JSON snapshot of in.Metadata so an HTTP-level + // test can pin the request -> gRPC mapping without a new RPC. + if strings.Contains(in.Prompt, "ECHO_PREDICT_METADATA") { + payload, err := json.Marshal(in.Metadata) + if err != nil { + return nil, fmt.Errorf("mock backend echo metadata error: %w", err) + } + return &pb.Reply{ + Message: payload, + Tokens: int32(len(in.Metadata)), + PromptTokens: 1, + }, nil + } + // ECHO_SERVED_MODEL returns the loaded model file path so router e2e // tests can verify which candidate actually served the request without // adding a new RPC. The router fans out to a single backend process per