Compare commits


12 Commits

Author SHA1 Message Date
Ettore Di Giacinto
61a6e95f7d Additional thinking tags
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-20 12:02:35 +01:00
Ettore Di Giacinto
a352125726 chore: refactorings
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-20 11:48:00 +01:00
Ettore Di Giacinto
187e474daf fix(reasoning): handle only closing tags
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-20 11:40:29 +01:00
Ettore Di Giacinto
4bf2f8bbd8 chore(docs): update docs with Anthropic API and openresponses
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-20 09:25:24 +01:00
LocalAI [bot]
d3525b7509 chore: ⬆️ Update ggml-org/llama.cpp to 959ecf7f234dc0bc0cd6829b25cb0ee1481aa78a (#8122)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-19 22:50:47 +01:00
LocalAI [bot]
c8aa821e0e chore: ⬆️ Update leejet/stable-diffusion.cpp to a48b4a3ade9972faf0adcad47e51c6fc03f0e46d (#8121)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-19 22:27:46 +01:00
dependabot[bot]
b3191927ae chore(deps): bump github.com/mudler/cogito from 0.7.2 to 0.8.1 (#8124)
Bumps [github.com/mudler/cogito](https://github.com/mudler/cogito) from 0.7.2 to 0.8.1.
- [Release notes](https://github.com/mudler/cogito/releases)
- [Commits](https://github.com/mudler/cogito/compare/v0.7.2...v0.8.1)

---
updated-dependencies:
- dependency-name: github.com/mudler/cogito
  dependency-version: 0.8.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-19 22:26:26 +01:00
LocalAI [bot]
54c5a2d9ea docs: ⬆️ update docs version mudler/LocalAI (#8120)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-19 21:18:24 +00:00
Ettore Di Giacinto
0279591fec Enable reranking for Qwen3-VL-Reranker-8B
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-01-19 15:28:58 +01:00
LocalAI [bot]
8845186955 chore: ⬆️ Update leejet/stable-diffusion.cpp to 2efd19978dd4164e387bf226025c9666b6ef35e2 (#8099)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-18 22:40:35 +01:00
LocalAI [bot]
ab8ed24358 chore: ⬆️ Update ggml-org/llama.cpp to 287a33017b32600bfc0e81feeb0ad6e81e0dd484 (#8100)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-18 22:40:14 +01:00
LocalAI [bot]
a021df5a88 feat(swagger): update swagger (#8098)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-18 22:10:06 +01:00
20 changed files with 2677 additions and 395 deletions


@@ -1,5 +1,5 @@
-LLAMA_VERSION?=2fbde785bc106ae1c4102b0e82b9b41d9c466579
+LLAMA_VERSION?=959ecf7f234dc0bc0cd6829b25cb0ee1481aa78a
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=


@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=9565c7f6bd5fcff124c589147b2621244f2c4aa1
+STABLEDIFFUSION_GGML_VERSION?=a48b4a3ade9972faf0adcad47e51c6fc03f0e46d
CMAKE_ARGS+=-DGGML_MAX_NAME=128


@@ -10,6 +10,7 @@ import (
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/reasoning"
"github.com/mudler/cogito"
"gopkg.in/yaml.v3"
)
@@ -51,6 +52,7 @@ type ModelConfig struct {
ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"`
FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
+ReasoningConfig reasoning.ReasoningConfig `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`
FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
// LLM configs (GPT4ALL, Llama.cpp, ...)


@@ -13,6 +13,7 @@ import (
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/reasoning"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/pkg/model"
@@ -43,10 +44,19 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
lastEmittedReasoning := ""
lastEmittedCleanedContent := ""
+// Configure reasoning extraction options
+// Auto-detect if prompt ends with thinking tag
+// or use explicit config setting
+thinkingForcedOpen := config.ReasoningConfig.ThinkingForcedOpen || reasoning.DetectThinkingForcedOpen(s)
_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
accumulatedContent += s
// Extract reasoning from accumulated content
-currentReasoning, cleanedContent := functions.ExtractReasoning(accumulatedContent)
+opts := []reasoning.Option{}
+if thinkingForcedOpen {
+opts = append(opts, reasoning.WithThinkingForcedOpen())
+}
+currentReasoning, cleanedContent := reasoning.Extract(accumulatedContent, opts...)
// Calculate new reasoning delta (what we haven't emitted yet)
var reasoningDelta *string
@@ -230,7 +240,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
return err
}
// Extract reasoning before processing tool calls
-reasoning, cleanedResult := functions.ExtractReasoning(result)
+// Auto-detect if prompt ends with thinking tag or use explicit config
+toolsThinkingForcedOpen := config.ReasoningConfig.ThinkingForcedOpen || reasoning.DetectThinkingForcedOpen(prompt)
+opts := []reasoning.Option{}
+if toolsThinkingForcedOpen {
+opts = append(opts, reasoning.WithThinkingForcedOpen())
+}
+extractedReasoning, cleanedResult := reasoning.Extract(result, opts...)
result = cleanedResult
textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
@@ -266,8 +282,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
}
var deltaReasoning *string
if reasoning != "" {
deltaReasoning = &reasoning
if extractedReasoning != "" {
deltaReasoning = &extractedReasoning
}
delta := &schema.Message{Content: &result}
if deltaReasoning != nil {
@@ -618,17 +634,24 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
// no streaming mode
default:
+// Auto-detect if prompt ends with thinking tag for non-streaming mode
+nonStreamThinkingForcedOpen := config.ReasoningConfig.ThinkingForcedOpen || reasoning.DetectThinkingForcedOpen(predInput)
tokenCallback := func(s string, c *[]schema.Choice) {
// Extract reasoning from the response
-reasoning, cleanedS := functions.ExtractReasoning(s)
-s = cleanedS
+var extractedReasoning string
+opts := []reasoning.Option{}
+if nonStreamThinkingForcedOpen {
+opts = append(opts, reasoning.WithThinkingForcedOpen())
+}
+extractedReasoning, s = reasoning.Extract(s, opts...)
if !shouldUseFn {
// no function is called, just reply and use stop as finish reason
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &s}
if reasoning != "" {
message.Reasoning = &reasoning
if extractedReasoning != "" {
message.Reasoning = &extractedReasoning
}
*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: message})
return
@@ -650,8 +673,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
stopReason := FinishReasonStop
message := &schema.Message{Role: "assistant", Content: &result}
if reasoning != "" {
message.Reasoning = &reasoning
if extractedReasoning != "" {
message.Reasoning = &extractedReasoning
}
*c = append(*c, schema.Choice{
FinishReason: &stopReason,
@@ -664,8 +687,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
Role: "assistant",
},
}
if reasoning != "" {
toolChoice.Message.Reasoning = &reasoning
if extractedReasoning != "" {
toolChoice.Message.Reasoning = &extractedReasoning
}
for _, ss := range results {
@@ -695,8 +718,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
"arguments": args,
},
}
if reasoning != "" {
message.Reasoning = &reasoning
if extractedReasoning != "" {
message.Reasoning = &extractedReasoning
}
*c = append(*c, schema.Choice{
FinishReason: &functionCallReason,


@@ -72,6 +72,359 @@ You can list all the models available with:
curl http://localhost:8080/v1/models
```
### Anthropic Messages API
LocalAI supports the Anthropic Messages API, which is compatible with Claude clients. This endpoint provides a structured way to send messages and receive responses, with support for tools, streaming, and multimodal content.
**Endpoint:** `POST /v1/messages` or `POST /messages`
**Reference:** https://docs.anthropic.com/claude/reference/messages_post
#### Basic Usage
```bash
curl http://localhost:8080/v1/messages \
-H "Content-Type: application/json" \
-H "anthropic-version: 2023-06-01" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Say this is a test!"}
]
}'
```
#### Request Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `model` | string | Yes | The model identifier |
| `messages` | array | Yes | Array of message objects with `role` and `content` |
| `max_tokens` | integer | Yes | Maximum number of tokens to generate (must be > 0) |
| `system` | string | No | System message to set the assistant's behavior |
| `temperature` | float | No | Sampling temperature (0.0 to 1.0) |
| `top_p` | float | No | Nucleus sampling parameter |
| `top_k` | integer | No | Top-k sampling parameter |
| `stop_sequences` | array | No | Array of strings that will stop generation |
| `stream` | boolean | No | Enable streaming responses |
| `tools` | array | No | Array of tool definitions for function calling |
| `tool_choice` | string/object | No | Tool choice strategy: "auto", "any", "none", or specific tool |
| `metadata` | object | No | Custom metadata to attach to the request |
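For programmatic use, the sketch below sends the basic request above from Go. It is a minimal illustration, not an official client: the endpoint and field names follow the table, and the model name is a placeholder.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Build the request body using the required fields from the table above.
	body, _ := json.Marshal(map[string]any{
		"model":      "ggml-koala-7b-model-q4_0-r2.bin", // placeholder model name
		"max_tokens": 1024,
		"messages": []map[string]any{
			{"role": "user", "content": "Say this is a test!"},
		},
	})

	req, _ := http.NewRequest("POST", "http://localhost:8080/v1/messages", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("anthropic-version", "2023-06-01")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Print the raw JSON response.
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```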
#### Message Format
Messages can contain text or structured content blocks:
```bash
curl http://localhost:8080/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"max_tokens": 1024,
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is in this image?"
},
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/jpeg",
"data": "base64_encoded_image_data"
}
}
]
}
]
}'
```
#### Tool Calling
The Anthropic API supports function calling through tools:
```bash
curl http://localhost:8080/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"max_tokens": 1024,
"tools": [
{
"name": "get_weather",
"description": "Get the current weather",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state"
}
},
"required": ["location"]
}
}
],
"tool_choice": "auto",
"messages": [
{"role": "user", "content": "What is the weather in San Francisco?"}
]
}'
```
#### Streaming
Enable streaming responses by setting `stream: true`:
```bash
curl http://localhost:8080/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"max_tokens": 1024,
"stream": true,
"messages": [
{"role": "user", "content": "Tell me a story"}
]
}'
```
Streaming responses use Server-Sent Events (SSE) format with event types: `message_start`, `content_block_start`, `content_block_delta`, `content_block_stop`, `message_delta`, and `message_stop`.
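As a rough sketch, the stream can be consumed from Go by reading the `event:` and `data:` lines directly. This is a minimal reader under those assumptions, not a full SSE parser, and the model name is a placeholder.

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	body := []byte(`{"model":"ggml-koala-7b-model-q4_0-r2.bin","max_tokens":1024,"stream":true,` +
		`"messages":[{"role":"user","content":"Tell me a story"}]}`)

	req, _ := http.NewRequest("POST", "http://localhost:8080/v1/messages", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// SSE frames arrive as "event: <type>" and "data: <json>" lines.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "event: "):
			fmt.Println("event:", strings.TrimPrefix(line, "event: "))
		case strings.HasPrefix(line, "data: "):
			fmt.Println("data:", strings.TrimPrefix(line, "data: "))
		}
	}
}
```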
#### Response Format
```json
{
"id": "msg_abc123",
"type": "message",
"role": "assistant",
"content": [
{
"type": "text",
"text": "This is a test!"
}
],
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"stop_reason": "end_turn",
"usage": {
"input_tokens": 10,
"output_tokens": 5
}
}
```
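For typed access from Go, a sketch of structs covering only the fields shown above might look like the following; these are illustrative, not official client types.

```go
// Illustrative Go types for the /v1/messages response example above.
type AnthropicResponse struct {
	ID         string         `json:"id"`
	Type       string         `json:"type"`
	Role       string         `json:"role"`
	Content    []ContentBlock `json:"content"`
	Model      string         `json:"model"`
	StopReason string         `json:"stop_reason"`
	Usage      Usage          `json:"usage"`
}

type ContentBlock struct {
	Type string `json:"type"`
	Text string `json:"text"`
}

type Usage struct {
	InputTokens  int `json:"input_tokens"`
	OutputTokens int `json:"output_tokens"`
}
```

Unmarshal the response body into `AnthropicResponse` with `encoding/json` to read the text blocks and token usage.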
### Open Responses API
LocalAI supports the Open Responses API specification, which provides a standardized interface for AI model interactions with support for background processing, streaming, tool calling, and advanced features like reasoning.
**Endpoint:** `POST /v1/responses` or `POST /responses`
**Reference:** https://www.openresponses.org/specification
#### Basic Usage
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"input": "Say this is a test!",
"max_output_tokens": 1024
}'
```
#### Request Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `model` | string | Yes | The model identifier |
| `input` | string/array | Yes | Input text or array of input items |
| `max_output_tokens` | integer | No | Maximum number of tokens to generate |
| `temperature` | float | No | Sampling temperature |
| `top_p` | float | No | Nucleus sampling parameter |
| `instructions` | string | No | System instructions |
| `tools` | array | No | Array of tool definitions |
| `tool_choice` | string/object | No | Tool choice: "auto", "required", "none", or specific tool |
| `stream` | boolean | No | Enable streaming responses |
| `background` | boolean | No | Run request in background (returns immediately) |
| `store` | boolean | No | Whether to store the response |
| `reasoning` | object | No | Reasoning configuration with `effort` and `summary` |
| `parallel_tool_calls` | boolean | No | Allow parallel tool calls |
| `max_tool_calls` | integer | No | Maximum number of tool calls |
| `presence_penalty` | float | No | Presence penalty (-2.0 to 2.0) |
| `frequency_penalty` | float | No | Frequency penalty (-2.0 to 2.0) |
| `top_logprobs` | integer | No | Number of top logprobs to return |
| `truncation` | string | No | Truncation mode: "auto" or "disabled" |
| `text_format` | object | No | Text format configuration |
| `metadata` | object | No | Custom metadata |
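For typed clients, a rough Go struct covering the most common fields from this table could be sketched as below. This is illustrative only; `any` is used for `input` because it accepts either a string or an array of input items.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// ResponsesRequest is an illustrative subset of the request fields above.
type ResponsesRequest struct {
	Model           string  `json:"model"`
	Input           any     `json:"input"` // string or array of input items
	MaxOutputTokens int     `json:"max_output_tokens,omitempty"`
	Temperature     float64 `json:"temperature,omitempty"`
	Stream          bool    `json:"stream,omitempty"`
	Background      bool    `json:"background,omitempty"`
}

func main() {
	b, _ := json.Marshal(ResponsesRequest{
		Model:           "ggml-koala-7b-model-q4_0-r2.bin", // placeholder
		Input:           "Say this is a test!",
		MaxOutputTokens: 1024,
	})
	fmt.Println(string(b))
}
```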
#### Input Format
Input can be a simple string or an array of structured items:
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"input": [
{
"type": "message",
"role": "user",
"content": "What is the weather?"
}
],
"max_output_tokens": 1024
}'
```
#### Background Processing
Run requests in the background for long-running tasks:
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"input": "Generate a long story",
"max_output_tokens": 4096,
"background": true
}'
```
The response will include a response ID that can be used to poll for completion:
```json
{
"id": "resp_abc123",
"object": "response",
"status": "in_progress",
"created_at": 1234567890
}
```
#### Retrieving Background Responses
Use the GET endpoint to retrieve background responses:
```bash
# Get response by ID
curl http://localhost:8080/v1/responses/resp_abc123
# Resume streaming with query parameters
curl "http://localhost:8080/v1/responses/resp_abc123?stream=true&starting_after=10"
```
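Putting the two steps together, a hedged Go sketch of the background flow (submit, then poll the ID until the status leaves `in_progress`) could look like this. The status values follow the examples above, and the poll interval and model name are arbitrary placeholders.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"
)

func main() {
	// Submit a background request.
	body := `{"model":"ggml-koala-7b-model-q4_0-r2.bin","input":"Generate a long story",` +
		`"max_output_tokens":4096,"background":true}`
	resp, err := http.Post("http://localhost:8080/v1/responses", "application/json", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	var created struct {
		ID     string `json:"id"`
		Status string `json:"status"`
	}
	json.NewDecoder(resp.Body).Decode(&created)
	resp.Body.Close()

	// Poll until the response is no longer in progress.
	for created.Status == "in_progress" {
		time.Sleep(2 * time.Second)
		r, err := http.Get("http://localhost:8080/v1/responses/" + created.ID)
		if err != nil {
			panic(err)
		}
		json.NewDecoder(r.Body).Decode(&created)
		r.Body.Close()
	}
	fmt.Println("final status:", created.Status)
}
```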
#### Canceling Background Responses
Cancel a background response that's still in progress:
```bash
curl -X POST http://localhost:8080/v1/responses/resp_abc123/cancel
```
#### Tool Calling
The Open Responses API supports function calling with tools:
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"input": "What is the weather in San Francisco?",
"tools": [
{
"type": "function",
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state"
}
},
"required": ["location"]
}
}
],
"tool_choice": "auto",
"max_output_tokens": 1024
}'
```
#### Reasoning Configuration
Configure reasoning effort and summary style:
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"input": "Solve this complex problem step by step",
"reasoning": {
"effort": "high",
"summary": "detailed"
},
"max_output_tokens": 2048
}'
```
#### Response Format
```json
{
"id": "resp_abc123",
"object": "response",
"created_at": 1234567890,
"completed_at": 1234567895,
"status": "completed",
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"output": [
{
"type": "message",
"id": "msg_001",
"role": "assistant",
"content": [
{
"type": "output_text",
"text": "This is a test!",
"annotations": [],
"logprobs": []
}
],
"status": "completed"
}
],
"error": null,
"incomplete_details": null,
"temperature": 0.7,
"top_p": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"usage": {
"input_tokens": 10,
"output_tokens": 5,
"total_tokens": 15,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens_details": {
"reasoning_tokens": 0
}
}
}
```
## Backends
### RWKV


@@ -112,6 +112,66 @@ curl http://localhost:8080/v1/chat/completions \
</details>
### Anthropic Messages API
LocalAI supports the Anthropic Messages API for Claude-compatible models. [Anthropic documentation](https://docs.anthropic.com/claude/reference/messages_post).
<details>
```bash
curl http://localhost:8080/v1/messages \
-H "Content-Type: application/json" \
-H "anthropic-version: 2023-06-01" \
-d '{
"model": "gpt-4",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "How are you doing?"}
],
"temperature": 0.7
}'
```
</details>
### Open Responses API
LocalAI supports the Open Responses API specification with support for background processing, streaming, and advanced features. [Open Responses documentation](https://www.openresponses.org/specification).
<details>
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"input": "Say this is a test!",
"max_output_tokens": 1024,
"temperature": 0.7
}'
```
For background processing:
```bash
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"input": "Generate a long story",
"max_output_tokens": 4096,
"background": true
}'
```
Then retrieve the response:
```bash
curl http://localhost:8080/v1/responses/<response_id>
```
</details>
### Image Generation
Creates an image given a prompt. [OpenAI documentation](https://platform.openai.com/docs/api-reference/images/create).


@@ -1,3 +1,3 @@
{
"version": "v3.9.0"
"version": "v3.10.0"
}


@@ -29,6 +29,7 @@
This description emphasizes its capabilities, efficiency, and versatility for multimodal search tasks.
overrides:
+reranking: true
parameters:
model: llama-cpp/models/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
name: Qwen3-VL-Reranker-8B-GGUF

go.mod

@@ -32,7 +32,7 @@ require (
github.com/mholt/archiver/v3 v3.5.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/modelcontextprotocol/go-sdk v1.2.0
-github.com/mudler/cogito v0.7.2
+github.com/mudler/cogito v0.8.1
github.com/mudler/edgevpn v0.31.1
github.com/mudler/go-processmanager v0.1.0
github.com/mudler/memory v0.0.0-20251216220809-d1256471a6c2

go.sum

@@ -507,8 +507,8 @@ github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7P
github.com/mr-tron/base58 v1.1.2/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o=
github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
-github.com/mudler/cogito v0.7.2 h1:J5eHZPsxpoKcnYUfogje5u0nnzGww7ytv7nSn1DMpms=
-github.com/mudler/cogito v0.7.2/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
+github.com/mudler/cogito v0.8.1 h1:66qPJkAMrq/Vo8AC/PvXWuVxYPhi7X2DQuJIilL8+3I=
+github.com/mudler/cogito v0.8.1/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/edgevpn v0.31.1 h1:7qegiDWd0kAg6ljhNHxqvp8hbo/6BbzSdbb7/2WZfiY=
github.com/mudler/edgevpn v0.31.1/go.mod h1:ftV5B0nKFzm4R8vR80UYnCb2nf7lxCRgAALxUEEgCf8=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=


@@ -1,114 +0,0 @@
package functions
import (
"strings"
)
// ExtractReasoning extracts reasoning content from thinking tags and returns
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles <thinking>...</thinking> and <think>...</think> tags.
// Multiple reasoning blocks are concatenated with newlines.
func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
if content == "" {
return "", content
}
var reasoningParts []string
var cleanedParts []string
remaining := content
// Define tag pairs to look for
tagPairs := []struct {
start string
end string
}{
{"<thinking>", "</thinking>"},
{"<think>", "</think>"},
}
// Track the last position we've processed
lastPos := 0
for {
// Find the earliest tag start
earliestStart := -1
earliestEnd := -1
isUnclosed := false
var matchedTag struct {
start string
end string
}
for _, tagPair := range tagPairs {
startIdx := strings.Index(remaining[lastPos:], tagPair.start)
if startIdx == -1 {
continue
}
startIdx += lastPos
// Find the corresponding end tag
endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
if endIdx == -1 {
// Unclosed tag - extract what we have
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = len(remaining)
isUnclosed = true
matchedTag = tagPair
}
continue
}
endIdx += startIdx + len(tagPair.start)
// Found a complete tag pair
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = endIdx + len(tagPair.end)
isUnclosed = false
matchedTag = tagPair
}
}
if earliestStart == -1 {
// No more tags found, add remaining content
if lastPos < len(remaining) {
cleanedParts = append(cleanedParts, remaining[lastPos:])
}
break
}
// Add content before the tag
if earliestStart > lastPos {
cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
}
// Extract reasoning content
reasoningStart := earliestStart + len(matchedTag.start)
// For unclosed tags, earliestEnd is already at the end of the string
// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
var reasoningEnd int
if isUnclosed {
// Unclosed tag - extract everything to the end
reasoningEnd = len(remaining)
} else {
// Closed tag - exclude the end tag
reasoningEnd = earliestEnd - len(matchedTag.end)
}
if reasoningEnd > reasoningStart {
reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
if reasoningContent != "" {
reasoningParts = append(reasoningParts, reasoningContent)
}
}
// Move past this tag
lastPos = earliestEnd
}
// Combine reasoning parts
reasoning = strings.Join(reasoningParts, "\n\n")
// Combine cleaned content parts
cleanedContent = strings.Join(cleanedParts, "")
return reasoning, cleanedContent
}


@@ -1,261 +0,0 @@
package functions_test
import (
"strings"
. "github.com/mudler/LocalAI/pkg/functions"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("ExtractReasoning", func() {
Context("when content has no reasoning tags", func() {
It("should return empty reasoning and original content", func() {
content := "This is regular content without any tags."
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(content))
})
It("should handle empty string", func() {
content := ""
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(BeEmpty())
})
It("should handle content with only whitespace", func() {
content := " \n\t "
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(content))
})
})
Context("when content has <thinking> tags", func() {
It("should extract reasoning from single thinking block", func() {
content := "Some text <thinking>This is my reasoning</thinking> More text"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("This is my reasoning"))
Expect(cleaned).To(Equal("Some text More text"))
})
It("should extract reasoning and preserve surrounding content", func() {
content := "Before <thinking>Reasoning here</thinking> After"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle thinking block at the start", func() {
content := "<thinking>Start reasoning</thinking> Regular content"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Start reasoning"))
Expect(cleaned).To(Equal(" Regular content"))
})
It("should handle thinking block at the end", func() {
content := "Regular content <thinking>End reasoning</thinking>"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("End reasoning"))
Expect(cleaned).To(Equal("Regular content "))
})
It("should handle only thinking block", func() {
content := "<thinking>Only reasoning</thinking>"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Only reasoning"))
Expect(cleaned).To(BeEmpty())
})
It("should trim whitespace from reasoning content", func() {
content := "Text <thinking> \n Reasoning with spaces \n </thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Reasoning with spaces"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has <think> tags", func() {
It("should extract reasoning from redacted_reasoning block", func() {
content := "Text <think>Redacted reasoning</think> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Redacted reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle redacted_reasoning with multiline content", func() {
content := "Before <think>Line 1\nLine 2\nLine 3</think> After"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle redacted_reasoning with complex content", func() {
content := "Start <think>Complex reasoning\nwith\nmultiple\nlines</think> End"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Complex reasoning\nwith\nmultiple\nlines"))
Expect(cleaned).To(Equal("Start End"))
})
})
Context("when content has multiple reasoning blocks", func() {
It("should concatenate multiple thinking blocks with newlines", func() {
content := "Text <thinking>First</thinking> Middle <thinking>Second</thinking> End"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("First\n\nSecond"))
Expect(cleaned).To(Equal("Text Middle End"))
})
It("should handle multiple different tag types", func() {
content := "A <thinking>One</thinking> B <think>Two</think> C <think>Three</think> D"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(ContainSubstring("One"))
Expect(reasoning).To(ContainSubstring("Two"))
Expect(reasoning).To(ContainSubstring("Three"))
Expect(cleaned).To(Equal("A B C D"))
})
It("should handle nested tags correctly (extracts first match)", func() {
content := "Text <thinking>Outer <think>Inner</think></thinking> More"
reasoning, cleaned := ExtractReasoning(content)
// Should extract the outer thinking block
Expect(reasoning).To(ContainSubstring("Outer"))
Expect(reasoning).To(ContainSubstring("Inner"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has unclosed reasoning tags", func() {
It("should extract unclosed thinking block", func() {
content := "Text <thinking>Unclosed reasoning"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Unclosed reasoning"))
Expect(cleaned).To(Equal("Text "))
})
It("should extract unclosed think block", func() {
content := "Before <think>Incomplete"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Incomplete"))
Expect(cleaned).To(Equal("Before "))
})
It("should extract unclosed redacted_reasoning block", func() {
content := "Start <think>Partial reasoning content"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Partial reasoning content"))
Expect(cleaned).To(Equal("Start "))
})
It("should handle unclosed tag at the end", func() {
content := "Regular content <thinking>Unclosed at end"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Unclosed at end"))
Expect(cleaned).To(Equal("Regular content "))
})
})
Context("when content has empty reasoning blocks", func() {
It("should ignore empty thinking block", func() {
content := "Text <thinking></thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("Text More"))
})
It("should ignore thinking block with only whitespace", func() {
content := "Text <thinking> \n\t </thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has reasoning tags with special characters", func() {
It("should handle reasoning with newlines", func() {
content := "Before <thinking>Line 1\nLine 2\nLine 3</thinking> After"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle reasoning with code blocks", func() {
content := "Text <thinking>Reasoning with ```code``` blocks</thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Reasoning with ```code``` blocks"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle reasoning with JSON", func() {
content := "Before <think>{\"key\": \"value\"}</think> After"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("{\"key\": \"value\"}"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle reasoning with HTML-like content", func() {
content := "Text <thinking>Reasoning with <tags> inside</thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Reasoning with <tags> inside"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has reasoning mixed with regular content", func() {
It("should preserve content order correctly", func() {
content := "Start <thinking>Reasoning</thinking> Middle <think>More reasoning</think> End"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(ContainSubstring("Reasoning"))
Expect(reasoning).To(ContainSubstring("More reasoning"))
Expect(cleaned).To(Equal("Start Middle End"))
})
It("should handle reasoning in the middle of a sentence", func() {
content := "This is a <thinking>reasoning</thinking> sentence."
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("reasoning"))
Expect(cleaned).To(Equal("This is a sentence."))
})
})
Context("edge cases", func() {
It("should handle content with only opening tag", func() {
content := "<thinking>"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(""))
})
It("should handle content with only closing tag", func() {
content := "</thinking>"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("</thinking>"))
})
It("should handle mismatched tags", func() {
content := "<thinking>Content</think>"
reasoning, cleaned := ExtractReasoning(content)
// Should extract unclosed thinking block
Expect(reasoning).To(ContainSubstring("Content"))
Expect(cleaned).To(Equal(""))
})
It("should handle very long reasoning content", func() {
longReasoning := strings.Repeat("This is reasoning content. ", 100)
content := "Text <thinking>" + longReasoning + "</thinking> More"
reasoning, cleaned := ExtractReasoning(content)
// TrimSpace is applied, so we need to account for that
Expect(reasoning).To(Equal(strings.TrimSpace(longReasoning)))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle reasoning with unicode characters", func() {
content := "Text <thinking>Reasoning with 中文 and emoji 🧠</thinking> More"
reasoning, cleaned := ExtractReasoning(content)
Expect(reasoning).To(Equal("Reasoning with 中文 and emoji 🧠"))
Expect(cleaned).To(Equal("Text More"))
})
})
})

pkg/reasoning/config.go

@@ -0,0 +1,8 @@
package reasoning
type ReasoningConfig struct {
// ThinkingForcedOpen indicates that the model outputs reasoning without an opening tag.
// When true, all content from the start is treated as reasoning until a closing tag is found.
// This is useful for models like GLM-4 that output reasoning without <think> but end with </think>.
ThinkingForcedOpen bool `yaml:"thinking_forced_open,omitempty" json:"thinking_forced_open,omitempty"`
}
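For illustration, a hypothetical model configuration enabling this flag might look like the sketch below. The `reasoning` key follows the yaml tag on `ModelConfig.ReasoningConfig` in this PR; the model name and file are placeholders.

```yaml
# Hypothetical model config sketch: enable forced-open thinking extraction
# for a model whose template emits reasoning without an opening <think> tag.
name: glm-4
parameters:
  model: glm-4.gguf   # placeholder model file
reasoning:
  thinking_forced_open: true
```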

pkg/reasoning/options.go

@@ -0,0 +1,18 @@
package reasoning
// options holds the configuration for reasoning extraction
type options struct {
thinkingForcedOpen bool
}
// Option is a functional option for configuring reasoning extraction
type Option func(*options)
// WithThinkingForcedOpen configures the extractor to treat all content from the start
// as reasoning until a closing tag is found. This is useful for models like GLM-4
// that output reasoning without <think> but end with </think>.
func WithThinkingForcedOpen() Option {
return func(o *options) {
o.thinkingForcedOpen = true
}
}
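A small usage sketch of this option, based on the `Extract` API added in this PR:

```go
package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/reasoning"
)

func main() {
	// Standard extraction: tags are stripped and returned separately.
	r, content := reasoning.Extract("<think>plan the answer</think>Hello!")
	fmt.Printf("reasoning=%q content=%q\n", r, content) // reasoning="plan the answer" content="Hello!"

	// Forced-open mode (e.g. GLM-4): everything before the closing tag is reasoning.
	r, content = reasoning.Extract("plan the answer</think>Hello!", reasoning.WithThinkingForcedOpen())
	fmt.Printf("reasoning=%q content=%q\n", r, content)
}
```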

pkg/reasoning/reasoning.go

@@ -0,0 +1,256 @@
package reasoning
import (
"strings"
)
// Common thinking/reasoning opening tags used by various models.
// These match the tags detected by llama.cpp in common/chat.cpp
var thinkingOpenTags = []string{
// DeepSeek R1, V3.1, Nemotron V2, MiniMax M2, Hermes 2 Pro, Granite, Exaone MOE
"<think>\n",
"<think>",
// Generic thinking tags
"<thinking>\n",
"<thinking>",
// Apertus
"<|inner_prefix|>",
// Command R7B
"<|START_THINKING|>",
// Seed
"<seed:think>",
// Magistral (not in llama.cpp but common)
"[THINK]\n",
"[THINK]",
}
// DetectThinkingForcedOpen checks if a prompt ends with a thinking opening tag.
// This is used to automatically detect when the model template has already added
// the opening thinking tag, meaning the model will output reasoning content directly.
// Returns true if the prompt ends with a known thinking opening tag.
func DetectThinkingForcedOpen(prompt string) bool {
for _, tag := range thinkingOpenTags {
if strings.HasSuffix(prompt, tag) {
return true
}
}
return false
}
// Extract extracts reasoning content from thinking tags and returns
// both the extracted reasoning and the cleaned content (with tags removed).
// It handles <thinking>...</thinking> and <think>...</think> tags.
// Multiple reasoning blocks are concatenated with newlines.
// It also handles the case where only a closing tag is present (no opening tag),
// in which case everything before the closing tag is treated as reasoning.
//
// Use WithThinkingForcedOpen() option when all content from the start should be
// treated as reasoning until a closing tag is found.
func Extract(content string, opts ...Option) (reasoning string, cleanedContent string) {
if content == "" {
return "", content
}
cfg := &options{}
for _, opt := range opts {
opt(cfg)
}
if cfg.thinkingForcedOpen {
return extractForcedOpen(content)
}
return extractFromTags(content)
}
// extractForcedOpen handles the case where reasoning starts without an opening tag.
// All content from the start is treated as reasoning until a closing tag is found.
func extractForcedOpen(content string) (reasoning string, cleanedContent string) {
// Look for the earliest closing tag
// These match the closing tags used by llama.cpp for various models
closingTags := []string{
"</thinking>",
"</think>",
"<|END_THINKING|>", // Command R7B
"<|inner_suffix|>", // Apertus
"</seed:think>", // Seed
"[/THINK]", // Magistral
}
earliestCloseIdx := -1
var matchedCloseTag string
for _, closeTag := range closingTags {
idx := strings.Index(content, closeTag)
if idx != -1 && (earliestCloseIdx == -1 || idx < earliestCloseIdx) {
earliestCloseIdx = idx
matchedCloseTag = closeTag
}
}
if earliestCloseIdx == -1 {
// No closing tag found - all content is reasoning (still streaming)
return strings.TrimSpace(content), ""
}
// Found closing tag - everything before is reasoning, everything after is content
reasoning = strings.TrimSpace(content[:earliestCloseIdx])
cleanedContent = content[earliestCloseIdx+len(matchedCloseTag):]
// Continue processing the rest for any additional reasoning blocks
if cleanedContent != "" {
additionalReasoning, finalContent := extractFromTags(cleanedContent)
if additionalReasoning != "" {
if reasoning != "" {
reasoning = reasoning + "\n\n" + additionalReasoning
} else {
reasoning = additionalReasoning
}
}
cleanedContent = finalContent
}
return reasoning, cleanedContent
}
// extractFromTags extracts reasoning content from thinking tags.
// This is the core implementation that handles standard tag-based extraction.
func extractFromTags(content string) (reasoning string, cleanedContent string) {
if content == "" {
return "", content
}
var reasoningParts []string
var cleanedParts []string
remaining := content
// Define tag pairs to look for
// These match the tags used by llama.cpp for various models
tagPairs := []struct {
start string
end string
}{
{"<thinking>", "</thinking>"},
{"<think>", "</think>"},
{"<|START_THINKING|>", "<|END_THINKING|>"}, // Command R7B
{"<|inner_prefix|>", "<|inner_suffix|>"}, // Apertus
{"<seed:think>", "</seed:think>"}, // Seed
{"[THINK]", "[/THINK]"}, // Magistral
}
// Track the last position we've processed
lastPos := 0
for {
// Find the earliest tag start
earliestStart := -1
earliestEnd := -1
isUnclosed := false
isClosingOnly := false
var matchedTag struct {
start string
end string
}
for _, tagPair := range tagPairs {
startIdx := strings.Index(remaining[lastPos:], tagPair.start)
endIdx := strings.Index(remaining[lastPos:], tagPair.end)
// Check for closing-only tag (closing tag appears before or without opening tag)
if endIdx != -1 && (startIdx == -1 || endIdx < startIdx) {
// Found a closing tag without a preceding opening tag
closingTagPos := endIdx + lastPos
if earliestStart == -1 || closingTagPos < earliestStart || (isClosingOnly && closingTagPos < earliestEnd) {
earliestStart = lastPos
earliestEnd = closingTagPos + len(tagPair.end)
isClosingOnly = true
isUnclosed = false
matchedTag = tagPair
}
continue
}
if startIdx == -1 {
continue
}
startIdx += lastPos
// Find the corresponding end tag after the start tag
endIdxAfterStart := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
if endIdxAfterStart == -1 {
// Unclosed tag - extract what we have
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = len(remaining)
isUnclosed = true
isClosingOnly = false
matchedTag = tagPair
}
continue
}
endIdxAfterStart += startIdx + len(tagPair.start)
// Found a complete tag pair
if earliestStart == -1 || startIdx < earliestStart {
earliestStart = startIdx
earliestEnd = endIdxAfterStart + len(tagPair.end)
isUnclosed = false
isClosingOnly = false
matchedTag = tagPair
}
}
if earliestStart == -1 {
// No more tags found, add remaining content
if lastPos < len(remaining) {
cleanedParts = append(cleanedParts, remaining[lastPos:])
}
break
}
if isClosingOnly {
// Closing tag without opening tag - content before closing tag is reasoning
reasoningContent := strings.TrimSpace(remaining[lastPos : earliestEnd-len(matchedTag.end)])
if reasoningContent != "" {
reasoningParts = append(reasoningParts, reasoningContent)
}
// Move past the closing tag
lastPos = earliestEnd
continue
}
// Add content before the tag
if earliestStart > lastPos {
cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
}
// Extract reasoning content
reasoningStart := earliestStart + len(matchedTag.start)
// For unclosed tags, earliestEnd is already at the end of the string
// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
var reasoningEnd int
if isUnclosed {
// Unclosed tag - extract everything to the end
reasoningEnd = len(remaining)
} else {
// Closed tag - exclude the end tag
reasoningEnd = earliestEnd - len(matchedTag.end)
}
if reasoningEnd > reasoningStart {
reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
if reasoningContent != "" {
reasoningParts = append(reasoningParts, reasoningContent)
}
}
// Move past this tag
lastPos = earliestEnd
}
// Combine reasoning parts
reasoning = strings.Join(reasoningParts, "\n\n")
// Combine cleaned content parts
cleanedContent = strings.Join(cleanedParts, "")
return reasoning, cleanedContent
}


@@ -0,0 +1,13 @@
package reasoning_test
import (
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
func TestReasoning(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Reasoning Suite")
}


@@ -0,0 +1,499 @@
package reasoning_test
import (
"strings"
. "github.com/mudler/LocalAI/pkg/reasoning"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("DetectThinkingForcedOpen", func() {
It("should detect <think> at end of prompt", func() {
Expect(DetectThinkingForcedOpen("Some prompt<think>")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt<think>\n")).To(BeTrue())
})
It("should detect <thinking> at end of prompt", func() {
Expect(DetectThinkingForcedOpen("Some prompt<thinking>")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt<thinking>\n")).To(BeTrue())
})
It("should detect model-specific tags", func() {
Expect(DetectThinkingForcedOpen("Some prompt<|inner_prefix|>")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt<|START_THINKING|>")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt<seed:think>")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt[THINK]")).To(BeTrue())
Expect(DetectThinkingForcedOpen("Some prompt[THINK]\n")).To(BeTrue())
})
It("should not detect if tag is in the middle", func() {
Expect(DetectThinkingForcedOpen("Some <think> prompt")).To(BeFalse())
Expect(DetectThinkingForcedOpen("<think>reasoning</think>")).To(BeFalse())
})
It("should not detect if no thinking tag", func() {
Expect(DetectThinkingForcedOpen("Some regular prompt")).To(BeFalse())
Expect(DetectThinkingForcedOpen("")).To(BeFalse())
})
})
var _ = Describe("Extract", func() {
Context("when content has no reasoning tags", func() {
It("should return empty reasoning and original content", func() {
content := "This is regular content without any tags."
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(content))
})
It("should handle empty string", func() {
content := ""
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(BeEmpty())
})
It("should handle content with only whitespace", func() {
content := " \n\t "
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(content))
})
})
Context("when content has <thinking> tags", func() {
It("should extract reasoning from single thinking block", func() {
content := "Some text <thinking>This is my reasoning</thinking> More text"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("This is my reasoning"))
Expect(cleaned).To(Equal("Some text More text"))
})
It("should extract reasoning and preserve surrounding content", func() {
content := "Before <thinking>Reasoning here</thinking> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle thinking block at the start", func() {
content := "<thinking>Start reasoning</thinking> Regular content"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Start reasoning"))
Expect(cleaned).To(Equal(" Regular content"))
})
It("should handle thinking block at the end", func() {
content := "Regular content <thinking>End reasoning</thinking>"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("End reasoning"))
Expect(cleaned).To(Equal("Regular content "))
})
It("should handle only thinking block", func() {
content := "<thinking>Only reasoning</thinking>"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Only reasoning"))
Expect(cleaned).To(BeEmpty())
})
It("should trim whitespace from reasoning content", func() {
content := "Text <thinking> \n Reasoning with spaces \n </thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning with spaces"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has <think> tags", func() {
It("should extract reasoning from redacted_reasoning block", func() {
content := "Text <think>Redacted reasoning</think> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Redacted reasoning"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle redacted_reasoning with multiline content", func() {
content := "Before <think>Line 1\nLine 2\nLine 3</think> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle redacted_reasoning with complex content", func() {
content := "Start <think>Complex reasoning\nwith\nmultiple\nlines</think> End"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Complex reasoning\nwith\nmultiple\nlines"))
Expect(cleaned).To(Equal("Start End"))
})
})
Context("when content has multiple reasoning blocks", func() {
It("should concatenate multiple thinking blocks with newlines", func() {
content := "Text <thinking>First</thinking> Middle <thinking>Second</thinking> End"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("First\n\nSecond"))
Expect(cleaned).To(Equal("Text Middle End"))
})
It("should handle multiple different tag types", func() {
content := "A <thinking>One</thinking> B <think>Two</think> C <think>Three</think> D"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(ContainSubstring("One"))
Expect(reasoning).To(ContainSubstring("Two"))
Expect(reasoning).To(ContainSubstring("Three"))
Expect(cleaned).To(Equal("A B C D"))
})
It("should handle nested tags correctly (extracts first match)", func() {
content := "Text <thinking>Outer <think>Inner</think></thinking> More"
reasoning, cleaned := Extract(content)
// Should extract the outer thinking block
Expect(reasoning).To(ContainSubstring("Outer"))
Expect(reasoning).To(ContainSubstring("Inner"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has unclosed reasoning tags", func() {
It("should extract unclosed thinking block", func() {
content := "Text <thinking>Unclosed reasoning"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Unclosed reasoning"))
Expect(cleaned).To(Equal("Text "))
})
It("should extract unclosed think block", func() {
content := "Before <think>Incomplete"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Incomplete"))
Expect(cleaned).To(Equal("Before "))
})
It("should extract unclosed redacted_reasoning block", func() {
content := "Start <think>Partial reasoning content"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Partial reasoning content"))
Expect(cleaned).To(Equal("Start "))
})
It("should handle unclosed tag at the end", func() {
content := "Regular content <thinking>Unclosed at end"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Unclosed at end"))
Expect(cleaned).To(Equal("Regular content "))
})
})
Context("when content has empty reasoning blocks", func() {
It("should ignore empty thinking block", func() {
content := "Text <thinking></thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("Text More"))
})
It("should ignore thinking block with only whitespace", func() {
content := "Text <thinking> \n\t </thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has reasoning tags with special characters", func() {
It("should handle reasoning with newlines", func() {
content := "Before <thinking>Line 1\nLine 2\nLine 3</thinking> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle reasoning with code blocks", func() {
content := "Text <thinking>Reasoning with ```code``` blocks</thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning with ```code``` blocks"))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle reasoning with JSON", func() {
content := "Before <think>{\"key\": \"value\"}</think> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("{\"key\": \"value\"}"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle reasoning with HTML-like content", func() {
content := "Text <thinking>Reasoning with <tags> inside</thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning with <tags> inside"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("when content has reasoning mixed with regular content", func() {
It("should preserve content order correctly", func() {
content := "Start <thinking>Reasoning</thinking> Middle <think>More reasoning</think> End"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(ContainSubstring("Reasoning"))
Expect(reasoning).To(ContainSubstring("More reasoning"))
Expect(cleaned).To(Equal("Start Middle End"))
})
It("should handle reasoning in the middle of a sentence", func() {
content := "This is a <thinking>reasoning</thinking> sentence."
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning"))
Expect(cleaned).To(Equal("This is a sentence."))
})
})
Context("edge cases without WithThinkingForcedOpen", func() {
It("should handle content with only opening tag", func() {
content := "<thinking>"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal(""))
})
It("should handle content with only closing tag (no content before)", func() {
content := "</thinking>"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(BeEmpty())
})
It("should extract reasoning when only closing tag is present", func() {
// GLM-4 style: reasoning content followed by closing tag without opening tag
content := "This is reasoning content</think>this is the actual response"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("This is reasoning content"))
Expect(cleaned).To(Equal("this is the actual response"))
})
It("should handle closing-only tag with multiline reasoning", func() {
content := "1. First point\n2. Second point\n3. Third point</think>Final answer"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("1. First point\n2. Second point\n3. Third point"))
Expect(cleaned).To(Equal("Final answer"))
})
It("should handle closing-only tag with complex reasoning (GLM-4 example)", func() {
content := "**Analyze the user's input:** The user says something.\n\n**Final Decision:** Output the text.</think>this is a test"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("**Analyze the user's input:** The user says something.\n\n**Final Decision:** Output the text."))
Expect(cleaned).To(Equal("this is a test"))
})
It("should handle closing-only thinking tag", func() {
content := "Some reasoning here</thinking>actual content"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Some reasoning here"))
Expect(cleaned).To(Equal("actual content"))
})
It("should handle mismatched tags", func() {
content := "<thinking>Content</think>"
reasoning, cleaned := Extract(content)
// Should extract unclosed thinking block
Expect(reasoning).To(ContainSubstring("Content"))
Expect(cleaned).To(Equal(""))
})
It("should handle very long reasoning content", func() {
longReasoning := strings.Repeat("This is reasoning content. ", 100)
content := "Text <thinking>" + longReasoning + "</thinking> More"
reasoning, cleaned := Extract(content)
// TrimSpace is applied, so we need to account for that
Expect(reasoning).To(Equal(strings.TrimSpace(longReasoning)))
Expect(cleaned).To(Equal("Text More"))
})
It("should handle reasoning with unicode characters", func() {
content := "Text <thinking>Reasoning with 中文 and emoji 🧠</thinking> More"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning with 中文 and emoji 🧠"))
Expect(cleaned).To(Equal("Text More"))
})
})
Context("with WithThinkingForcedOpen option", func() {
It("should treat all content as reasoning until closing tag", func() {
content := "This is reasoning</think>this is content"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("This is reasoning"))
Expect(cleaned).To(Equal("this is content"))
})
It("should treat all content as reasoning when no closing tag (streaming)", func() {
content := "This is reasoning content still streaming"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("This is reasoning content still streaming"))
Expect(cleaned).To(BeEmpty())
})
It("should handle GLM-4 style output", func() {
content := "**Analyze:** The user says something.\n\n**Final Decision:** Output the text.</think>this is a test"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("**Analyze:** The user says something.\n\n**Final Decision:** Output the text."))
Expect(cleaned).To(Equal("this is a test"))
})
It("should handle multiline reasoning with closing tag", func() {
content := "1. First point\n2. Second point\n3. Third point</think>Final answer"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("1. First point\n2. Second point\n3. Third point"))
Expect(cleaned).To(Equal("Final answer"))
})
It("should handle </thinking> closing tag", func() {
content := "Some reasoning here</thinking>actual content"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Some reasoning here"))
Expect(cleaned).To(Equal("actual content"))
})
It("should handle additional reasoning blocks after initial forced open", func() {
content := "Initial reasoning</think>content<think>more reasoning</think>final content"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Initial reasoning\n\nmore reasoning"))
Expect(cleaned).To(Equal("contentfinal content"))
})
It("should handle empty content", func() {
reasoning, cleaned := Extract("", WithThinkingForcedOpen())
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(BeEmpty())
})
It("should handle only closing tag", func() {
content := "</think>only content"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(BeEmpty())
Expect(cleaned).To(Equal("only content"))
})
It("should find earliest closing tag", func() {
// </think> comes before </thinking>
content := "Reasoning</think>content</thinking>more"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Reasoning"))
Expect(cleaned).To(Equal("content</thinking>more"))
})
It("should handle Command R7B closing tag", func() {
content := "Reasoning content<|END_THINKING|>actual response"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle Apertus closing tag", func() {
content := "Reasoning content<|inner_suffix|>actual response"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle Seed closing tag", func() {
content := "Reasoning content</seed:think>actual response"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle Magistral closing tag", func() {
content := "Reasoning content[/THINK]actual response"
reasoning, cleaned := Extract(content, WithThinkingForcedOpen())
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
})
Context("with model-specific tag pairs", func() {
It("should extract Command R7B reasoning tags", func() {
content := "Before <|START_THINKING|>reasoning here<|END_THINKING|> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should extract Apertus reasoning tags", func() {
content := "Before <|inner_prefix|>reasoning here<|inner_suffix|> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should extract Seed reasoning tags", func() {
content := "Before <seed:think>reasoning here</seed:think> After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should extract Magistral reasoning tags", func() {
content := "Before [THINK]reasoning here[/THINK] After"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning here"))
Expect(cleaned).To(Equal("Before After"))
})
It("should handle unclosed Command R7B tag", func() {
content := "Before <|START_THINKING|>reasoning still streaming"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning still streaming"))
Expect(cleaned).To(Equal("Before "))
})
It("should handle unclosed Apertus tag", func() {
content := "Before <|inner_prefix|>reasoning still streaming"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning still streaming"))
Expect(cleaned).To(Equal("Before "))
})
It("should handle unclosed Seed tag", func() {
content := "Before <seed:think>reasoning still streaming"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning still streaming"))
Expect(cleaned).To(Equal("Before "))
})
It("should handle unclosed Magistral tag", func() {
content := "Before [THINK]reasoning still streaming"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("reasoning still streaming"))
Expect(cleaned).To(Equal("Before "))
})
It("should handle closing-only Command R7B tag", func() {
content := "Reasoning content<|END_THINKING|>actual response"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle closing-only Apertus tag", func() {
content := "Reasoning content<|inner_suffix|>actual response"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle closing-only Seed tag", func() {
content := "Reasoning content</seed:think>actual response"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
It("should handle closing-only Magistral tag", func() {
content := "Reasoning content[/THINK]actual response"
reasoning, cleaned := Extract(content)
Expect(reasoning).To(Equal("Reasoning content"))
Expect(cleaned).To(Equal("actual response"))
})
})
})

View File

@@ -1259,6 +1259,116 @@ const docTemplate = `{
}
}
},
"/v1/responses": {
"post": {
"summary": "Create a response using the Open Responses API",
"parameters": [
{
"description": "Request body",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.OpenResponsesRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
}
}
}
},
"/v1/responses/{id}": {
"get": {
"description": "Retrieve a response by ID. Can be used for polling background responses or resuming streaming responses.",
"summary": "Get a response by ID",
"parameters": [
{
"type": "string",
"description": "Response ID",
"name": "id",
"in": "path",
"required": true
},
{
"type": "string",
"description": "Set to 'true' to resume streaming",
"name": "stream",
"in": "query"
},
{
"type": "integer",
"description": "Sequence number to resume from (for streaming)",
"name": "starting_after",
"in": "query"
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
},
"400": {
"description": "Bad Request",
"schema": {
"type": "object",
"additionalProperties": true
}
},
"404": {
"description": "Not Found",
"schema": {
"type": "object",
"additionalProperties": true
}
}
}
}
},
"/v1/responses/{id}/cancel": {
"post": {
"description": "Cancel a background response if it's still in progress",
"summary": "Cancel a response",
"parameters": [
{
"type": "string",
"description": "Response ID",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
},
"400": {
"description": "Bad Request",
"schema": {
"type": "object",
"additionalProperties": true
}
},
"404": {
"description": "Not Found",
"schema": {
"type": "object",
"additionalProperties": true
}
}
}
}
},
"/v1/sound-generation": {
"post": {
"summary": "Generates audio from the input text.",
@@ -2507,6 +2617,322 @@ const docTemplate = `{
}
}
},
"schema.ORError": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
},
"type": {
"description": "invalid_request|not_found|server_error|model_error|too_many_requests",
"type": "string"
}
}
},
"schema.ORFunctionTool": {
"type": "object",
"properties": {
"description": {
"type": "string"
},
"name": {
"type": "string"
},
"parameters": {
"type": "object",
"additionalProperties": true
},
"strict": {
"description": "Always include in response",
"type": "boolean"
},
"type": {
"description": "always \"function\"",
"type": "string"
}
}
},
"schema.ORIncompleteDetails": {
"type": "object",
"properties": {
"reason": {
"type": "string"
}
}
},
"schema.ORInputTokensDetails": {
"type": "object",
"properties": {
"cached_tokens": {
"description": "Always include, even if 0",
"type": "integer"
}
}
},
"schema.ORItemField": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"call_id": {
"description": "Function call fields",
"type": "string"
},
"content": {
"description": "string or []ORContentPart for messages"
},
"id": {
"description": "Present for all output items",
"type": "string"
},
"name": {
"type": "string"
},
"output": {
"description": "Function call output fields"
},
"role": {
"description": "Message fields",
"type": "string"
},
"status": {
"description": "in_progress|completed|incomplete",
"type": "string"
},
"type": {
"description": "message|function_call|function_call_output|reasoning|item_reference",
"type": "string"
}
}
},
"schema.OROutputTokensDetails": {
"type": "object",
"properties": {
"reasoning_tokens": {
"description": "Always include, even if 0",
"type": "integer"
}
}
},
"schema.ORReasoning": {
"type": "object",
"properties": {
"effort": {
"type": "string"
},
"summary": {
"type": "string"
}
}
},
"schema.ORReasoningParam": {
"type": "object",
"properties": {
"effort": {
"description": "\"none\"|\"low\"|\"medium\"|\"high\"|\"xhigh\"",
"type": "string"
},
"summary": {
"description": "\"auto\"|\"concise\"|\"detailed\"",
"type": "string"
}
}
},
"schema.ORResponseResource": {
"type": "object",
"properties": {
"background": {
"type": "boolean"
},
"completed_at": {
"description": "Required: present as number or null",
"type": "integer"
},
"created_at": {
"type": "integer"
},
"error": {
"description": "Always present, null if no error",
"allOf": [
{
"$ref": "#/definitions/schema.ORError"
}
]
},
"frequency_penalty": {
"type": "number"
},
"id": {
"type": "string"
},
"incomplete_details": {
"description": "Always present, null if complete",
"allOf": [
{
"$ref": "#/definitions/schema.ORIncompleteDetails"
}
]
},
"instructions": {
"type": "string"
},
"max_output_tokens": {
"type": "integer"
},
"max_tool_calls": {
"description": "nullable",
"type": "integer"
},
"metadata": {
"description": "Metadata and operational flags",
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"object": {
"description": "always \"response\"",
"type": "string"
},
"output": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORItemField"
}
},
"parallel_tool_calls": {
"type": "boolean"
},
"presence_penalty": {
"type": "number"
},
"previous_response_id": {
"type": "string"
},
"prompt_cache_key": {
"description": "nullable",
"type": "string"
},
"reasoning": {
"description": "nullable",
"allOf": [
{
"$ref": "#/definitions/schema.ORReasoning"
}
]
},
"safety_identifier": {
"description": "Safety and caching",
"type": "string"
},
"service_tier": {
"type": "string"
},
"status": {
"description": "in_progress|completed|failed|incomplete",
"type": "string"
},
"store": {
"type": "boolean"
},
"temperature": {
"description": "Sampling parameters (always required)",
"type": "number"
},
"text": {
"description": "Text format configuration",
"allOf": [
{
"$ref": "#/definitions/schema.ORTextConfig"
}
]
},
"tool_choice": {},
"tools": {
"description": "Tool-related fields",
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORFunctionTool"
}
},
"top_logprobs": {
"description": "Default to 0",
"type": "integer"
},
"top_p": {
"type": "number"
},
"truncation": {
"description": "Truncation and reasoning",
"type": "string"
},
"usage": {
"description": "Usage statistics",
"allOf": [
{
"$ref": "#/definitions/schema.ORUsage"
}
]
}
}
},
"schema.ORTextConfig": {
"type": "object",
"properties": {
"format": {
"$ref": "#/definitions/schema.ORTextFormat"
}
}
},
"schema.ORTextFormat": {
"type": "object",
"properties": {
"type": {
"description": "\"text\" or \"json_schema\"",
"type": "string"
}
}
},
"schema.ORUsage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"description": "Always present",
"allOf": [
{
"$ref": "#/definitions/schema.ORInputTokensDetails"
}
]
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"description": "Always present",
"allOf": [
{
"$ref": "#/definitions/schema.OROutputTokensDetails"
}
]
},
"total_tokens": {
"type": "integer"
}
}
},
"schema.OpenAIModel": {
"type": "object",
"properties": {
@@ -2781,6 +3207,114 @@ const docTemplate = `{
}
}
},
"schema.OpenResponsesRequest": {
"type": "object",
"properties": {
"allowed_tools": {
"description": "Restrict which tools can be invoked",
"type": "array",
"items": {
"type": "string"
}
},
"background": {
"description": "Run request in background",
"type": "boolean"
},
"frequency_penalty": {
"description": "Frequency penalty (-2.0 to 2.0)",
"type": "number"
},
"include": {
"description": "What to include in response",
"type": "array",
"items": {
"type": "string"
}
},
"input": {
"description": "string or []ORItemParam"
},
"instructions": {
"type": "string"
},
"logit_bias": {
"description": "OpenAI-compatible extensions (not in Open Responses spec)",
"type": "object",
"additionalProperties": {
"type": "number",
"format": "float64"
}
},
"max_output_tokens": {
"type": "integer"
},
"max_tool_calls": {
"description": "Maximum number of tool calls",
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"parallel_tool_calls": {
"description": "Allow parallel tool calls",
"type": "boolean"
},
"presence_penalty": {
"description": "Presence penalty (-2.0 to 2.0)",
"type": "number"
},
"previous_response_id": {
"type": "string"
},
"reasoning": {
"$ref": "#/definitions/schema.ORReasoningParam"
},
"service_tier": {
"description": "\"auto\"|\"default\"|priority hint",
"type": "string"
},
"store": {
"description": "Whether to store the response",
"type": "boolean"
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number"
},
"text_format": {
"description": "Additional parameters from spec"
},
"tool_choice": {
"description": "\"auto\"|\"required\"|\"none\"|{type:\"function\",name:\"...\"}"
},
"tools": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORFunctionTool"
}
},
"top_logprobs": {
"description": "Number of top logprobs to return",
"type": "integer"
},
"top_p": {
"type": "number"
},
"truncation": {
"description": "\"auto\"|\"disabled\"",
"type": "string"
}
}
},
"schema.P2PNodesResponse": {
"type": "object",
"properties": {

View File

@@ -1252,6 +1252,116 @@
}
}
},
"/v1/responses": {
"post": {
"summary": "Create a response using the Open Responses API",
"parameters": [
{
"description": "Request body",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.OpenResponsesRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
}
}
}
},
"/v1/responses/{id}": {
"get": {
"description": "Retrieve a response by ID. Can be used for polling background responses or resuming streaming responses.",
"summary": "Get a response by ID",
"parameters": [
{
"type": "string",
"description": "Response ID",
"name": "id",
"in": "path",
"required": true
},
{
"type": "string",
"description": "Set to 'true' to resume streaming",
"name": "stream",
"in": "query"
},
{
"type": "integer",
"description": "Sequence number to resume from (for streaming)",
"name": "starting_after",
"in": "query"
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
},
"400": {
"description": "Bad Request",
"schema": {
"type": "object",
"additionalProperties": true
}
},
"404": {
"description": "Not Found",
"schema": {
"type": "object",
"additionalProperties": true
}
}
}
}
},
"/v1/responses/{id}/cancel": {
"post": {
"description": "Cancel a background response if it's still in progress",
"summary": "Cancel a response",
"parameters": [
{
"type": "string",
"description": "Response ID",
"name": "id",
"in": "path",
"required": true
}
],
"responses": {
"200": {
"description": "Response",
"schema": {
"$ref": "#/definitions/schema.ORResponseResource"
}
},
"400": {
"description": "Bad Request",
"schema": {
"type": "object",
"additionalProperties": true
}
},
"404": {
"description": "Not Found",
"schema": {
"type": "object",
"additionalProperties": true
}
}
}
}
},
"/v1/sound-generation": {
"post": {
"summary": "Generates audio from the input text.",
@@ -2500,6 +2610,322 @@
}
}
},
"schema.ORError": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
},
"type": {
"description": "invalid_request|not_found|server_error|model_error|too_many_requests",
"type": "string"
}
}
},
"schema.ORFunctionTool": {
"type": "object",
"properties": {
"description": {
"type": "string"
},
"name": {
"type": "string"
},
"parameters": {
"type": "object",
"additionalProperties": true
},
"strict": {
"description": "Always include in response",
"type": "boolean"
},
"type": {
"description": "always \"function\"",
"type": "string"
}
}
},
"schema.ORIncompleteDetails": {
"type": "object",
"properties": {
"reason": {
"type": "string"
}
}
},
"schema.ORInputTokensDetails": {
"type": "object",
"properties": {
"cached_tokens": {
"description": "Always include, even if 0",
"type": "integer"
}
}
},
"schema.ORItemField": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"call_id": {
"description": "Function call fields",
"type": "string"
},
"content": {
"description": "string or []ORContentPart for messages"
},
"id": {
"description": "Present for all output items",
"type": "string"
},
"name": {
"type": "string"
},
"output": {
"description": "Function call output fields"
},
"role": {
"description": "Message fields",
"type": "string"
},
"status": {
"description": "in_progress|completed|incomplete",
"type": "string"
},
"type": {
"description": "message|function_call|function_call_output|reasoning|item_reference",
"type": "string"
}
}
},
"schema.OROutputTokensDetails": {
"type": "object",
"properties": {
"reasoning_tokens": {
"description": "Always include, even if 0",
"type": "integer"
}
}
},
"schema.ORReasoning": {
"type": "object",
"properties": {
"effort": {
"type": "string"
},
"summary": {
"type": "string"
}
}
},
"schema.ORReasoningParam": {
"type": "object",
"properties": {
"effort": {
"description": "\"none\"|\"low\"|\"medium\"|\"high\"|\"xhigh\"",
"type": "string"
},
"summary": {
"description": "\"auto\"|\"concise\"|\"detailed\"",
"type": "string"
}
}
},
"schema.ORResponseResource": {
"type": "object",
"properties": {
"background": {
"type": "boolean"
},
"completed_at": {
"description": "Required: present as number or null",
"type": "integer"
},
"created_at": {
"type": "integer"
},
"error": {
"description": "Always present, null if no error",
"allOf": [
{
"$ref": "#/definitions/schema.ORError"
}
]
},
"frequency_penalty": {
"type": "number"
},
"id": {
"type": "string"
},
"incomplete_details": {
"description": "Always present, null if complete",
"allOf": [
{
"$ref": "#/definitions/schema.ORIncompleteDetails"
}
]
},
"instructions": {
"type": "string"
},
"max_output_tokens": {
"type": "integer"
},
"max_tool_calls": {
"description": "nullable",
"type": "integer"
},
"metadata": {
"description": "Metadata and operational flags",
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"object": {
"description": "always \"response\"",
"type": "string"
},
"output": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORItemField"
}
},
"parallel_tool_calls": {
"type": "boolean"
},
"presence_penalty": {
"type": "number"
},
"previous_response_id": {
"type": "string"
},
"prompt_cache_key": {
"description": "nullable",
"type": "string"
},
"reasoning": {
"description": "nullable",
"allOf": [
{
"$ref": "#/definitions/schema.ORReasoning"
}
]
},
"safety_identifier": {
"description": "Safety and caching",
"type": "string"
},
"service_tier": {
"type": "string"
},
"status": {
"description": "in_progress|completed|failed|incomplete",
"type": "string"
},
"store": {
"type": "boolean"
},
"temperature": {
"description": "Sampling parameters (always required)",
"type": "number"
},
"text": {
"description": "Text format configuration",
"allOf": [
{
"$ref": "#/definitions/schema.ORTextConfig"
}
]
},
"tool_choice": {},
"tools": {
"description": "Tool-related fields",
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORFunctionTool"
}
},
"top_logprobs": {
"description": "Default to 0",
"type": "integer"
},
"top_p": {
"type": "number"
},
"truncation": {
"description": "Truncation and reasoning",
"type": "string"
},
"usage": {
"description": "Usage statistics",
"allOf": [
{
"$ref": "#/definitions/schema.ORUsage"
}
]
}
}
},
"schema.ORTextConfig": {
"type": "object",
"properties": {
"format": {
"$ref": "#/definitions/schema.ORTextFormat"
}
}
},
"schema.ORTextFormat": {
"type": "object",
"properties": {
"type": {
"description": "\"text\" or \"json_schema\"",
"type": "string"
}
}
},
"schema.ORUsage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"description": "Always present",
"allOf": [
{
"$ref": "#/definitions/schema.ORInputTokensDetails"
}
]
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"description": "Always present",
"allOf": [
{
"$ref": "#/definitions/schema.OROutputTokensDetails"
}
]
},
"total_tokens": {
"type": "integer"
}
}
},
"schema.OpenAIModel": {
"type": "object",
"properties": {
@@ -2774,6 +3200,114 @@
}
}
},
"schema.OpenResponsesRequest": {
"type": "object",
"properties": {
"allowed_tools": {
"description": "Restrict which tools can be invoked",
"type": "array",
"items": {
"type": "string"
}
},
"background": {
"description": "Run request in background",
"type": "boolean"
},
"frequency_penalty": {
"description": "Frequency penalty (-2.0 to 2.0)",
"type": "number"
},
"include": {
"description": "What to include in response",
"type": "array",
"items": {
"type": "string"
}
},
"input": {
"description": "string or []ORItemParam"
},
"instructions": {
"type": "string"
},
"logit_bias": {
"description": "OpenAI-compatible extensions (not in Open Responses spec)",
"type": "object",
"additionalProperties": {
"type": "number",
"format": "float64"
}
},
"max_output_tokens": {
"type": "integer"
},
"max_tool_calls": {
"description": "Maximum number of tool calls",
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"parallel_tool_calls": {
"description": "Allow parallel tool calls",
"type": "boolean"
},
"presence_penalty": {
"description": "Presence penalty (-2.0 to 2.0)",
"type": "number"
},
"previous_response_id": {
"type": "string"
},
"reasoning": {
"$ref": "#/definitions/schema.ORReasoningParam"
},
"service_tier": {
"description": "\"auto\"|\"default\"|priority hint",
"type": "string"
},
"store": {
"description": "Whether to store the response",
"type": "boolean"
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number"
},
"text_format": {
"description": "Additional parameters from spec"
},
"tool_choice": {
"description": "\"auto\"|\"required\"|\"none\"|{type:\"function\",name:\"...\"}"
},
"tools": {
"type": "array",
"items": {
"$ref": "#/definitions/schema.ORFunctionTool"
}
},
"top_logprobs": {
"description": "Number of top logprobs to return",
"type": "integer"
},
"top_p": {
"type": "number"
},
"truncation": {
"description": "\"auto\"|\"disabled\"",
"type": "string"
}
}
},
"schema.P2PNodesResponse": {
"type": "object",
"properties": {

View File

@@ -742,6 +742,212 @@ definitions:
tunnelAddress:
type: string
type: object
schema.ORError:
properties:
code:
type: string
message:
type: string
param:
type: string
type:
description: invalid_request|not_found|server_error|model_error|too_many_requests
type: string
type: object
schema.ORFunctionTool:
properties:
description:
type: string
name:
type: string
parameters:
additionalProperties: true
type: object
strict:
description: Always include in response
type: boolean
type:
description: always "function"
type: string
type: object
schema.ORIncompleteDetails:
properties:
reason:
type: string
type: object
schema.ORInputTokensDetails:
properties:
cached_tokens:
description: Always include, even if 0
type: integer
type: object
schema.ORItemField:
properties:
arguments:
type: string
call_id:
description: Function call fields
type: string
content:
description: string or []ORContentPart for messages
id:
description: Present for all output items
type: string
name:
type: string
output:
description: Function call output fields
role:
description: Message fields
type: string
status:
description: in_progress|completed|incomplete
type: string
type:
description: message|function_call|function_call_output|reasoning|item_reference
type: string
type: object
schema.OROutputTokensDetails:
properties:
reasoning_tokens:
description: Always include, even if 0
type: integer
type: object
schema.ORReasoning:
properties:
effort:
type: string
summary:
type: string
type: object
schema.ORReasoningParam:
properties:
effort:
description: '"none"|"low"|"medium"|"high"|"xhigh"'
type: string
summary:
description: '"auto"|"concise"|"detailed"'
type: string
type: object
schema.ORResponseResource:
properties:
background:
type: boolean
completed_at:
description: 'Required: present as number or null'
type: integer
created_at:
type: integer
error:
allOf:
- $ref: '#/definitions/schema.ORError'
description: Always present, null if no error
frequency_penalty:
type: number
id:
type: string
incomplete_details:
allOf:
- $ref: '#/definitions/schema.ORIncompleteDetails'
description: Always present, null if complete
instructions:
type: string
max_output_tokens:
type: integer
max_tool_calls:
description: nullable
type: integer
metadata:
additionalProperties:
type: string
description: Metadata and operational flags
type: object
model:
type: string
object:
description: always "response"
type: string
output:
items:
$ref: '#/definitions/schema.ORItemField'
type: array
parallel_tool_calls:
type: boolean
presence_penalty:
type: number
previous_response_id:
type: string
prompt_cache_key:
description: nullable
type: string
reasoning:
allOf:
- $ref: '#/definitions/schema.ORReasoning'
description: nullable
safety_identifier:
description: Safety and caching
type: string
service_tier:
type: string
status:
description: in_progress|completed|failed|incomplete
type: string
store:
type: boolean
temperature:
description: Sampling parameters (always required)
type: number
text:
allOf:
- $ref: '#/definitions/schema.ORTextConfig'
description: Text format configuration
tool_choice: {}
tools:
description: Tool-related fields
items:
$ref: '#/definitions/schema.ORFunctionTool'
type: array
top_logprobs:
description: Default to 0
type: integer
top_p:
type: number
truncation:
description: Truncation and reasoning
type: string
usage:
allOf:
- $ref: '#/definitions/schema.ORUsage'
description: Usage statistics
type: object
schema.ORTextConfig:
properties:
format:
$ref: '#/definitions/schema.ORTextFormat'
type: object
schema.ORTextFormat:
properties:
type:
description: '"text" or "json_schema"'
type: string
type: object
schema.ORUsage:
properties:
input_tokens:
type: integer
input_tokens_details:
allOf:
- $ref: '#/definitions/schema.ORInputTokensDetails'
description: Always present
output_tokens:
type: integer
output_tokens_details:
allOf:
- $ref: '#/definitions/schema.OROutputTokensDetails'
description: Always present
total_tokens:
type: integer
type: object
schema.OpenAIModel:
properties:
id:
@@ -936,6 +1142,82 @@ definitions:
total_tokens:
type: integer
type: object
schema.OpenResponsesRequest:
properties:
allowed_tools:
description: Restrict which tools can be invoked
items:
type: string
type: array
background:
description: Run request in background
type: boolean
frequency_penalty:
description: Frequency penalty (-2.0 to 2.0)
type: number
include:
description: What to include in response
items:
type: string
type: array
input:
description: string or []ORItemParam
instructions:
type: string
logit_bias:
additionalProperties:
format: float64
type: number
description: OpenAI-compatible extensions (not in Open Responses spec)
type: object
max_output_tokens:
type: integer
max_tool_calls:
description: Maximum number of tool calls
type: integer
metadata:
additionalProperties:
type: string
type: object
model:
type: string
parallel_tool_calls:
description: Allow parallel tool calls
type: boolean
presence_penalty:
description: Presence penalty (-2.0 to 2.0)
type: number
previous_response_id:
type: string
reasoning:
$ref: '#/definitions/schema.ORReasoningParam'
service_tier:
description: '"auto"|"default"|priority hint'
type: string
store:
description: Whether to store the response
type: boolean
stream:
type: boolean
temperature:
type: number
text_format:
description: Additional parameters from spec
tool_choice:
description: '"auto"|"required"|"none"|{type:"function",name:"..."}'
tools:
items:
$ref: '#/definitions/schema.ORFunctionTool'
type: array
top_logprobs:
description: Number of top logprobs to return
type: integer
top_p:
type: number
truncation:
description: '"auto"|"disabled"'
type: string
type: object
schema.P2PNodesResponse:
properties:
federated_nodes:
@@ -1962,6 +2244,80 @@ paths:
schema:
$ref: '#/definitions/schema.JINARerankResponse'
summary: Reranks a list of phrases by relevance to a given text query.
/v1/responses:
post:
parameters:
- description: Request body
in: body
name: request
required: true
schema:
$ref: '#/definitions/schema.OpenResponsesRequest'
responses:
"200":
description: Response
schema:
$ref: '#/definitions/schema.ORResponseResource'
summary: Create a response using the Open Responses API
/v1/responses/{id}:
get:
description: Retrieve a response by ID. Can be used for polling background responses
or resuming streaming responses.
parameters:
- description: Response ID
in: path
name: id
required: true
type: string
- description: Set to 'true' to resume streaming
in: query
name: stream
type: string
- description: Sequence number to resume from (for streaming)
in: query
name: starting_after
type: integer
responses:
"200":
description: Response
schema:
$ref: '#/definitions/schema.ORResponseResource'
"400":
description: Bad Request
schema:
additionalProperties: true
type: object
"404":
description: Not Found
schema:
additionalProperties: true
type: object
summary: Get a response by ID
/v1/responses/{id}/cancel:
post:
description: Cancel a background response if it's still in progress
parameters:
- description: Response ID
in: path
name: id
required: true
type: string
responses:
"200":
description: Response
schema:
$ref: '#/definitions/schema.ORResponseResource'
"400":
description: Bad Request
schema:
additionalProperties: true
type: object
"404":
description: Not Found
schema:
additionalProperties: true
type: object
summary: Cancel a response
/v1/sound-generation:
post:
parameters: