fix(mlx): strip file:// LocalPrefix before loading filesystem-imported models

MLX backends passed request.Model verbatim to mlx_lm/mlx_vlm load(). For a model imported from the filesystem, LocalAI hands the backend a file:// URI (its LocalPrefix), which load() rejects: the scheme is neither a valid HF repo id nor an existing path (Path(model).exists() fails on the scheme), producing "Repo id must be in the form 'repo_name' or 'namespace/repo_name' ... Use repo_type argument if needed". Add a pure, unit-testable resolve_model_path(model, model_file) helper in the shared python_utils: it prefers the resolved ModelFile, strips a file:// scheme and percent-decodes the path, and leaves plain repo ids and local paths untouched. Wire it into the mlx, mlx-vlm and mlx-distributed backends (load, model_key, and the distributed broadcast all use the normalized path). Fixes #7461. Assisted-by: claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-12 18:58:49 -04:00 · 2026-06-12 22:07:06 +00:00
8 changed files with 316 additions and 487 deletions
--- a/backend/python/common/python_utils.py
+++ b/backend/python/common/python_utils.py
@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
 chat-template-compatible message list from proto Message objects.
 """
 import json
+from urllib.parse import unquote
+
+
+def resolve_model_path(model, model_file=""):
+    """Resolve a LocalAI model reference to something an HF/MLX loader accepts.
+
+    LocalAI hands backends either a plain HuggingFace repo id
+    (``namespace/name``), an already-local filesystem path, or a
+    ``file://`` URI (its ``LocalPrefix``) for models imported from disk.
+    Loaders such as ``mlx_lm.load`` reject the ``file://`` form because the
+    scheme is neither a valid repo id nor an existing path, so we normalize
+    it here before loading.
+
+    Resolution order:
+      1. Prefer ``model_file`` when set and non-empty - that is the resolved
+         local path LocalAI computed for the model.
+      2. Strip a ``file://`` scheme and percent-decode it to a plain path.
+      3. Leave plain repo ids and already-local paths unchanged.
+    """
+    candidate = model_file if model_file else model
+    if candidate is None:
+        return candidate
+    if candidate.startswith("file://"):
+        return unquote(candidate[len("file://"):])
+    return candidate


 def parse_options(options_list):
--- a/backend/python/mlx-distributed/backend.py
+++ b/backend/python/mlx-distributed/backend.py
@@ -28,7 +28,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options as _shared_parse_options
+from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning


@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            from mlx_lm import load
            from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache

-            print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_lm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)

            self.options = parse_options(request.Options)
            print(f"Options: {self.options}", file=sys.stderr)
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                )
                self.coordinator = DistributedCoordinator(self.group)
                self.coordinator.broadcast_command(CMD_LOAD_MODEL)
-                self.coordinator.broadcast_model_name(request.Model)
+                self.coordinator.broadcast_model_name(model_path)
            else:
                print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)

@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

            if tokenizer_config:
                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
-                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
+                self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
            else:
-                self.model, self.tokenizer = load(request.Model)
+                self.model, self.tokenizer = load(model_path)

            if self.group is not None:
                from sharding import pipeline_auto_parallel
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                from mlx_cache import ThreadSafeLRUPromptCache
                max_cache_entries = self.options.get("max_cache_entries", 10)
                self.max_kv_size = self.options.get("max_kv_size", None)
-                self.model_key = request.Model
+                self.model_key = model_path
                self.lru_cache = ThreadSafeLRUPromptCache(
                    max_size=max_cache_entries,
                    can_trim_fn=can_trim_prompt_cache,
--- a/backend/python/mlx-vlm/backend.py
+++ b/backend/python/mlx-vlm/backend.py
@@ -18,7 +18,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 from mlx_vlm import load, stream_generate
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            backend_pb2.Result: The load model result.
        """
        try:
-            print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
            print(f"Request: {request}", file=sys.stderr)

            # Parse Options[] key:value strings into a typed dict
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

            # Load model and processor using MLX-VLM
            # mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
-            self.model, self.processor = load(request.Model)
+            self.model, self.processor = load(model_path)

            # Load model config for chat template support
-            self.config = load_config(request.Model)
+            self.config = load_config(model_path)

            # Auto-infer the tool parser from the chat template. mlx-vlm has
            # its own _infer_tool_parser that falls back to mlx-lm parsers.
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -17,7 +17,7 @@ import grpc
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
 from grpc_auth import get_auth_interceptors
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 from mlx_lm import load, stream_generate
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            backend_pb2.Result: The load model result.
        """
        try:
-            print(f"Loading MLX model: {request.Model}", file=sys.stderr)
+            # Normalize the model reference: strip LocalAI's file:// LocalPrefix
+            # and prefer the resolved ModelFile so mlx_lm.load() gets a plain
+            # repo id or filesystem path (it rejects file:// URIs).
+            model_path = resolve_model_path(request.Model, request.ModelFile)
+            print(f"Loading MLX model: {model_path}", file=sys.stderr)
            print(f"Request: {request}", file=sys.stderr)

            # Parse Options[] key:value strings into a typed dict (shared helper)
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # Load model and tokenizer using MLX
            if tokenizer_config:
                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
-                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
+                self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
            else:
-                self.model, self.tokenizer = load(request.Model)
+                self.model, self.tokenizer = load(model_path)

            # mlx_lm.load() returns a TokenizerWrapper that detects tool
            # calling and thinking markers from the chat template / vocab.
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # Initialize thread-safe LRU prompt cache for efficient generation
            max_cache_entries = self.options.get("max_cache_entries", 10)
            self.max_kv_size = self.options.get("max_kv_size", None)
-            self.model_key = request.Model
+            self.model_key = model_path
            self.lru_cache = ThreadSafeLRUPromptCache(
                max_size=max_cache_entries,
                can_trim_fn=can_trim_prompt_cache,
--- a/backend/python/mlx/test.py
+++ b/backend/python/mlx/test.py
@@ -12,7 +12,7 @@ import backend_pb2_grpc
 # Make the shared helpers importable so we can unit-test them without a
 # running gRPC server.
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
-from python_utils import messages_to_dicts, parse_options
+from python_utils import messages_to_dicts, parse_options, resolve_model_path
 from mlx_utils import parse_tool_calls, split_reasoning

 class TestBackendServicer(unittest.TestCase):
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
        self.assertEqual(r, "")
        self.assertEqual(c, "just text")

+    def test_resolve_model_path_file_uri(self):
+        # file:// LocalPrefix (LocalAI import) is stripped to a plain path.
+        self.assertEqual(resolve_model_path("file:///a/b"), "/a/b")
+
+    def test_resolve_model_path_file_uri_percent_decoded(self):
+        # Percent-encoded characters (e.g. spaces) are decoded.
+        self.assertEqual(
+            resolve_model_path("file:///Users/me/My%20Models/Qwen3"),
+            "/Users/me/My Models/Qwen3",
+        )
+
+    def test_resolve_model_path_hf_repo_id_unchanged(self):
+        # Plain HuggingFace repo ids must pass through untouched.
+        self.assertEqual(
+            resolve_model_path("mlx-community/Qwen3-Coder-30B"),
+            "mlx-community/Qwen3-Coder-30B",
+        )
+
+    def test_resolve_model_path_local_path_unchanged(self):
+        # An already-local absolute path is left as-is.
+        self.assertEqual(resolve_model_path("/models/Qwen3"), "/models/Qwen3")
+
+    def test_resolve_model_path_prefers_model_file(self):
+        # The resolved ModelFile wins over Model when both are set.
+        self.assertEqual(
+            resolve_model_path("file:///ignored", "/resolved/local/path"),
+            "/resolved/local/path",
+        )
+
+    def test_resolve_model_path_model_file_file_uri(self):
+        # A ModelFile that is itself a file:// URI is also normalized.
+        self.assertEqual(
+            resolve_model_path("ignored", "file:///a/b"),
+            "/a/b",
+        )
+
    def test_parse_tool_calls_with_shim(self):
        tm = types.SimpleNamespace(
            tool_call_start="<tool_call>",
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1648,12 +1648,6 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	var currentReasoningContentIndex int
 	var reasoningTokens int
 	extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)
-	// router classifies each streamed token into reasoning vs message deltas
-	// and decides which output item they target. It encapsulates the
-	// sticky-preferAutoparser fallback and the reasoningDelta-based gate that
-	// fix issue #9658 (live reasoning was mis-routed onto the msg_ item and
-	// only re-classified as a reasoning item after the stream completed).
-	router := newStreamReasoningRouter(extractor)

 	// Collect all output items for storage
 	var collectedOutputItems []schema.ORItemField
@@ -1677,7 +1671,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				// Reset reasoning and tool-call state for re-inference so reasoning
 				// extraction runs again on subsequent iterations
 				inToolCallMode = false
-				router.resetForIteration()
+				extractor.Reset()
 				currentMessageID = ""
 				lastEmittedToolCallCount = 0
 				currentReasoningID = ""
@@ -1838,101 +1832,110 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6

 				// If no tool calls detected yet, handle reasoning and text
 				if !inToolCallMode {
-					routing := router.route(token, tokenUsage)
+					var reasoningDelta, contentDelta string
+					goReasoning, goContent := extractor.ProcessToken(token)

-					// Handle reasoning item. The reasoning item is opened lazily
-					// on the first reasoning delta - gating on routing, not
-					// extractor.Reasoning() (issue #9658): when the C++
-					// autoparser drives reasoning via reasoning_content,
-					// extractor.Reasoning() stays empty and the old gate dropped
-					// the live reasoning item.
-					if routing.OpenReasoningItem {
-						outputIndex++
-						currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
-						reasoningItem := &schema.ORItemField{
-							Type:   "reasoning",
-							ID:     currentReasoningID,
-							Status: "in_progress",
+					if tokenUsage.HasChatDeltaContent() {
+						rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
+						contentDelta = cd
+						reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
+					} else {
+						reasoningDelta = goReasoning
+						contentDelta = goContent
+					}
+
+					// Handle reasoning item
+					if extractor.Reasoning() != "" {
+						// Check if we need to create reasoning item
+						if currentReasoningID == "" {
+							outputIndex++
+							currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
+							reasoningItem := &schema.ORItemField{
+								Type:   "reasoning",
+								ID:     currentReasoningID,
+								Status: "in_progress",
+							}
+							sendSSEEvent(c, &schema.ORStreamEvent{
+								Type:           "response.output_item.added",
+								SequenceNumber: sequenceNumber,
+								OutputIndex:    &outputIndex,
+								Item:           reasoningItem,
+							})
+							sequenceNumber++
+
+							// Emit content_part.added for reasoning
+							currentReasoningContentIndex = 0
+							emptyPart := makeOutputTextPart("")
+							sendSSEEvent(c, &schema.ORStreamEvent{
+								Type:           "response.content_part.added",
+								SequenceNumber: sequenceNumber,
+								ItemID:         currentReasoningID,
+								OutputIndex:    &outputIndex,
+								ContentIndex:   &currentReasoningContentIndex,
+								Part:           &emptyPart,
+							})
+							sequenceNumber++
 						}
-						sendSSEEvent(c, &schema.ORStreamEvent{
-							Type:           "response.output_item.added",
-							SequenceNumber: sequenceNumber,
-							OutputIndex:    &outputIndex,
-							Item:           reasoningItem,
-						})
-						sequenceNumber++

-						// Emit content_part.added for reasoning
-						currentReasoningContentIndex = 0
-						emptyPart := makeOutputTextPart("")
-						sendSSEEvent(c, &schema.ORStreamEvent{
-							Type:           "response.content_part.added",
-							SequenceNumber: sequenceNumber,
-							ItemID:         currentReasoningID,
-							OutputIndex:    &outputIndex,
-							ContentIndex:   &currentReasoningContentIndex,
-							Part:           &emptyPart,
-						})
-						sequenceNumber++
-					}
-
-					// Emit reasoning delta against the reasoning_ item id.
-					if routing.ReasoningDelta != "" {
-						sendSSEEvent(c, &schema.ORStreamEvent{
-							Type:           "response.output_text.delta",
-							SequenceNumber: sequenceNumber,
-							ItemID:         currentReasoningID,
-							OutputIndex:    &outputIndex,
-							ContentIndex:   &currentReasoningContentIndex,
-							Delta:          strPtr(routing.ReasoningDelta),
-							Logprobs:       emptyLogprobs(),
-						})
-						sequenceNumber++
-						c.Response().Flush()
-					}
-
-					// Open the message item lazily on the first content delta.
-					if routing.OpenMessageItem {
-						outputIndex++
-						currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
-						messageItem := &schema.ORItemField{
-							Type:    "message",
-							ID:      currentMessageID,
-							Status:  "in_progress",
-							Role:    "assistant",
-							Content: []schema.ORContentPart{},
+						// Emit reasoning delta if there's new content
+						if reasoningDelta != "" {
+							sendSSEEvent(c, &schema.ORStreamEvent{
+								Type:           "response.output_text.delta",
+								SequenceNumber: sequenceNumber,
+								ItemID:         currentReasoningID,
+								OutputIndex:    &outputIndex,
+								ContentIndex:   &currentReasoningContentIndex,
+								Delta:          strPtr(reasoningDelta),
+								Logprobs:       emptyLogprobs(),
+							})
+							sequenceNumber++
+							c.Response().Flush()
 						}
-						sendSSEEvent(c, &schema.ORStreamEvent{
-							Type:           "response.output_item.added",
-							SequenceNumber: sequenceNumber,
-							OutputIndex:    &outputIndex,
-							Item:           messageItem,
-						})
-						sequenceNumber++
-
-						// Emit content_part.added
-						currentContentIndex = 0
-						emptyPart := makeOutputTextPart("")
-						sendSSEEvent(c, &schema.ORStreamEvent{
-							Type:           "response.content_part.added",
-							SequenceNumber: sequenceNumber,
-							ItemID:         currentMessageID,
-							OutputIndex:    &outputIndex,
-							ContentIndex:   &currentContentIndex,
-							Part:           &emptyPart,
-						})
-						sequenceNumber++
 					}

-					// Emit text delta against the msg_ item id.
-					if routing.ContentDelta != "" {
+					// Only emit message content if there's actual content (not just reasoning)
+					if contentDelta != "" {
+						if currentMessageID == "" {
+							// Emit output_item.added for message
+							outputIndex++
+							currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
+							messageItem := &schema.ORItemField{
+								Type:    "message",
+								ID:      currentMessageID,
+								Status:  "in_progress",
+								Role:    "assistant",
+								Content: []schema.ORContentPart{},
+							}
+							sendSSEEvent(c, &schema.ORStreamEvent{
+								Type:           "response.output_item.added",
+								SequenceNumber: sequenceNumber,
+								OutputIndex:    &outputIndex,
+								Item:           messageItem,
+							})
+							sequenceNumber++
+
+							// Emit content_part.added
+							currentContentIndex = 0
+							emptyPart := makeOutputTextPart("")
+							sendSSEEvent(c, &schema.ORStreamEvent{
+								Type:           "response.content_part.added",
+								SequenceNumber: sequenceNumber,
+								ItemID:         currentMessageID,
+								OutputIndex:    &outputIndex,
+								ContentIndex:   &currentContentIndex,
+								Part:           &emptyPart,
+							})
+							sequenceNumber++
+						}
+
+						// Emit text delta
 						sendSSEEvent(c, &schema.ORStreamEvent{
 							Type:           "response.output_text.delta",
 							SequenceNumber: sequenceNumber,
 							ItemID:         currentMessageID,
 							OutputIndex:    &outputIndex,
 							ContentIndex:   &currentContentIndex,
-							Delta:          strPtr(routing.ContentDelta),
+							Delta:          strPtr(contentDelta),
 							Logprobs:       emptyLogprobs(),
 						})
 						sequenceNumber++
@@ -2328,109 +2331,112 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		return nil
 	}

-	// Non-tool-call streaming path.
-	//
-	// The message output item is created LAZILY on the first content delta
-	// (mirroring the tool-call path), not eagerly before the first token.
-	// Issue #9658: an eager msg_ item forced reasoning to a higher output
-	// index and made mis-split <think> text land on the pre-existing message,
-	// so the thinking monologue streamed as message text instead of reasoning.
-	var messageItem *schema.ORItemField
+	// Non-tool-call streaming path
+	// Emit output_item.added for message
+	currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
+	messageItem := &schema.ORItemField{
+		Type:    "message",
+		ID:      currentMessageID,
+		Status:  "in_progress",
+		Role:    "assistant",
+		Content: []schema.ORContentPart{},
+	}
+	sendSSEEvent(c, &schema.ORStreamEvent{
+		Type:           "response.output_item.added",
+		SequenceNumber: sequenceNumber,
+		OutputIndex:    &outputIndex,
+		Item:           messageItem,
+	})
+	sequenceNumber++
+
+	// Emit content_part.added
+	currentContentIndex = 0
+	emptyTextPart := makeOutputTextPart("")
+	sendSSEEvent(c, &schema.ORStreamEvent{
+		Type:           "response.content_part.added",
+		SequenceNumber: sequenceNumber,
+		ItemID:         currentMessageID,
+		OutputIndex:    &outputIndex,
+		ContentIndex:   &currentContentIndex,
+		Part:           &emptyTextPart,
+	})
+	sequenceNumber++

 	// Stream text deltas with reasoning extraction
 	tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
 		accumulatedText += token

-		routing := router.route(token, tokenUsage)
+		var reasoningDelta, contentDelta string
+		goReasoning, goContent := extractor.ProcessToken(token)

-		// Open the reasoning item lazily on the first reasoning delta.
-		if routing.OpenReasoningItem {
-			outputIndex++
-			currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
-			reasoningItem := &schema.ORItemField{
-				Type:   "reasoning",
-				ID:     currentReasoningID,
-				Status: "in_progress",
+		if tokenUsage.HasChatDeltaContent() {
+			rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
+			contentDelta = cd
+			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
+		} else {
+			reasoningDelta = goReasoning
+			contentDelta = goContent
+		}
+
+		// Handle reasoning item
+		if extractor.Reasoning() != "" {
+			// Check if we need to create reasoning item
+			if currentReasoningID == "" {
+				outputIndex++
+				currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
+				reasoningItem := &schema.ORItemField{
+					Type:   "reasoning",
+					ID:     currentReasoningID,
+					Status: "in_progress",
+				}
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.output_item.added",
+					SequenceNumber: sequenceNumber,
+					OutputIndex:    &outputIndex,
+					Item:           reasoningItem,
+				})
+				sequenceNumber++
+
+				// Emit content_part.added for reasoning
+				currentReasoningContentIndex = 0
+				emptyPart := makeOutputTextPart("")
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.content_part.added",
+					SequenceNumber: sequenceNumber,
+					ItemID:         currentReasoningID,
+					OutputIndex:    &outputIndex,
+					ContentIndex:   &currentReasoningContentIndex,
+					Part:           &emptyPart,
+				})
+				sequenceNumber++
 			}
-			sendSSEEvent(c, &schema.ORStreamEvent{
-				Type:           "response.output_item.added",
-				SequenceNumber: sequenceNumber,
-				OutputIndex:    &outputIndex,
-				Item:           reasoningItem,
-			})
-			sequenceNumber++

-			// Emit content_part.added for reasoning
-			currentReasoningContentIndex = 0
-			emptyPart := makeOutputTextPart("")
-			sendSSEEvent(c, &schema.ORStreamEvent{
-				Type:           "response.content_part.added",
-				SequenceNumber: sequenceNumber,
-				ItemID:         currentReasoningID,
-				OutputIndex:    &outputIndex,
-				ContentIndex:   &currentReasoningContentIndex,
-				Part:           &emptyPart,
-			})
-			sequenceNumber++
-		}
-
-		// Emit reasoning delta against the reasoning_ item id.
-		if routing.ReasoningDelta != "" {
-			sendSSEEvent(c, &schema.ORStreamEvent{
-				Type:           "response.output_text.delta",
-				SequenceNumber: sequenceNumber,
-				ItemID:         currentReasoningID,
-				OutputIndex:    &outputIndex,
-				ContentIndex:   &currentReasoningContentIndex,
-				Delta:          strPtr(routing.ReasoningDelta),
-				Logprobs:       emptyLogprobs(),
-			})
-			sequenceNumber++
-			c.Response().Flush()
-		}
-
-		// Open the message item lazily on the first content delta.
-		if routing.OpenMessageItem {
-			outputIndex++
-			currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
-			messageItem = &schema.ORItemField{
-				Type:    "message",
-				ID:      currentMessageID,
-				Status:  "in_progress",
-				Role:    "assistant",
-				Content: []schema.ORContentPart{},
+			// Emit reasoning delta if there's new content
+			if reasoningDelta != "" {
+				sendSSEEvent(c, &schema.ORStreamEvent{
+					Type:           "response.output_text.delta",
+					SequenceNumber: sequenceNumber,
+					ItemID:         currentReasoningID,
+					OutputIndex:    &outputIndex,
+					ContentIndex:   &currentReasoningContentIndex,
+					Delta:          strPtr(reasoningDelta),
+					Logprobs:       emptyLogprobs(),
+				})
+				sequenceNumber++
+				c.Response().Flush()
 			}
-			sendSSEEvent(c, &schema.ORStreamEvent{
-				Type:           "response.output_item.added",
-				SequenceNumber: sequenceNumber,
-				OutputIndex:    &outputIndex,
-				Item:           messageItem,
-			})
-			sequenceNumber++
-
-			// Emit content_part.added
-			currentContentIndex = 0
-			emptyTextPart := makeOutputTextPart("")
-			sendSSEEvent(c, &schema.ORStreamEvent{
-				Type:           "response.content_part.added",
-				SequenceNumber: sequenceNumber,
-				ItemID:         currentMessageID,
-				OutputIndex:    &outputIndex,
-				ContentIndex:   &currentContentIndex,
-				Part:           &emptyTextPart,
-			})
-			sequenceNumber++
 		}

-		// Emit text delta against the msg_ item id.
-		if routing.ContentDelta != "" {
+		// Only emit message content if there's actual content (not just reasoning)
+		if contentDelta != "" {
+			// Emit text delta
 			sendSSEEvent(c, &schema.ORStreamEvent{
 				Type:           "response.output_text.delta",
 				SequenceNumber: sequenceNumber,
 				ItemID:         currentMessageID,
 				OutputIndex:    &outputIndex,
 				ContentIndex:   &currentContentIndex,
-				Delta:          strPtr(routing.ContentDelta),
+				Delta:          strPtr(contentDelta),
 				Logprobs:       emptyLogprobs(),
 			})
 			sequenceNumber++
@@ -2555,78 +2561,40 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	// Convert logprobs for streaming events
 	mcpStreamLogprobs := convertLogprobsForStreaming(noToolLogprobs)

-	// The message item is created lazily on the first content delta (issue
-	// #9658). If no content streamed but final extraction produced text (e.g.
-	// the autoparser delivered everything at once), open the message item now
-	// so the closing events below are valid. A pure-reasoning turn (no content
-	// at all) leaves messageItem nil and emits no message item.
-	if messageItem == nil && result != "" {
-		outputIndex++
-		currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
-		messageItem = &schema.ORItemField{
-			Type:    "message",
-			ID:      currentMessageID,
-			Status:  "in_progress",
-			Role:    "assistant",
-			Content: []schema.ORContentPart{},
-		}
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.output_item.added",
-			SequenceNumber: sequenceNumber,
-			OutputIndex:    &outputIndex,
-			Item:           messageItem,
-		})
-		sequenceNumber++
+	// Emit output_text.done
+	sendSSEEvent(c, &schema.ORStreamEvent{
+		Type:           "response.output_text.done",
+		SequenceNumber: sequenceNumber,
+		ItemID:         currentMessageID,
+		OutputIndex:    &outputIndex,
+		ContentIndex:   &currentContentIndex,
+		Text:           strPtr(result),
+		Logprobs:       logprobsPtr(mcpStreamLogprobs),
+	})
+	sequenceNumber++

-		currentContentIndex = 0
-		emptyTextPart := makeOutputTextPart("")
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.content_part.added",
-			SequenceNumber: sequenceNumber,
-			ItemID:         currentMessageID,
-			OutputIndex:    &outputIndex,
-			ContentIndex:   &currentContentIndex,
-			Part:           &emptyTextPart,
-		})
-		sequenceNumber++
-	}
+	// Emit content_part.done (with actual logprobs)
+	resultPart := makeOutputTextPartWithLogprobs(result, noToolLogprobs)
+	sendSSEEvent(c, &schema.ORStreamEvent{
+		Type:           "response.content_part.done",
+		SequenceNumber: sequenceNumber,
+		ItemID:         currentMessageID,
+		OutputIndex:    &outputIndex,
+		ContentIndex:   &currentContentIndex,
+		Part:           &resultPart,
+	})
+	sequenceNumber++

-	if messageItem != nil {
-		// Emit output_text.done
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.output_text.done",
-			SequenceNumber: sequenceNumber,
-			ItemID:         currentMessageID,
-			OutputIndex:    &outputIndex,
-			ContentIndex:   &currentContentIndex,
-			Text:           strPtr(result),
-			Logprobs:       logprobsPtr(mcpStreamLogprobs),
-		})
-		sequenceNumber++
-
-		// Emit content_part.done (with actual logprobs)
-		resultPart := makeOutputTextPartWithLogprobs(result, noToolLogprobs)
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.content_part.done",
-			SequenceNumber: sequenceNumber,
-			ItemID:         currentMessageID,
-			OutputIndex:    &outputIndex,
-			ContentIndex:   &currentContentIndex,
-			Part:           &resultPart,
-		})
-		sequenceNumber++
-
-		// Emit output_item.done (with actual logprobs)
-		messageItem.Status = "completed"
-		messageItem.Content = []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, noToolLogprobs)}
-		sendSSEEvent(c, &schema.ORStreamEvent{
-			Type:           "response.output_item.done",
-			SequenceNumber: sequenceNumber,
-			OutputIndex:    &outputIndex,
-			Item:           messageItem,
-		})
-		sequenceNumber++
-	}
+	// Emit output_item.done (with actual logprobs)
+	messageItem.Status = "completed"
+	messageItem.Content = []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, noToolLogprobs)}
+	sendSSEEvent(c, &schema.ORStreamEvent{
+		Type:           "response.output_item.done",
+		SequenceNumber: sequenceNumber,
+		OutputIndex:    &outputIndex,
+		Item:           messageItem,
+	})
+	sequenceNumber++

 	// Emit function_call items from automatic tool parsing fallback
 	for _, fc := range streamFallbackToolCalls {
@@ -2663,13 +2631,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 	// Emit response.completed
 	now := time.Now().Unix()

-	// Collect final output items, ordered reasoning -> message -> tool calls.
-	// Issue #9658: reasoning is emitted as its own item ahead of the message,
-	// matching the streamed order (reasoning item is opened before the message
-	// item when the model thinks first).
+	// Collect final output items (reasoning first, then messages, then tool calls)
 	var finalOutputItems []schema.ORItemField
-	// Add reasoning item if one was streamed.
-	if router.ReasoningStreamed() && finalReasoning != "" {
+	// Add reasoning item if it exists
+	if currentReasoningID != "" && finalReasoning != "" {
 		finalOutputItems = append(finalOutputItems, schema.ORItemField{
 			Type:    "reasoning",
 			ID:      currentReasoningID,
@@ -2677,12 +2642,18 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 			Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
 		})
 	}
-	// Add the message item if one was produced (created lazily, so it may be
-	// nil for a pure-reasoning turn).
-	if messageItem != nil {
+	// Add message item
+	if len(collectedOutputItems) > 0 {
+		// Use collected items (may include reasoning already)
+		for _, item := range collectedOutputItems {
+			if item.Type == "message" {
+				finalOutputItems = append(finalOutputItems, item)
+			}
+		}
+	} else {
 		finalOutputItems = append(finalOutputItems, *messageItem)
 	}
-	// Add function_call items from fallback parsing.
+	// Add function_call items from fallback
 	for _, item := range collectedOutputItems {
 		if item.Type == "function_call" {
 			finalOutputItems = append(finalOutputItems, item)
--- a/core/http/endpoints/openresponses/responses_stream_reasoning.go
+++ b/core/http/endpoints/openresponses/responses_stream_reasoning.go
@@ -1,114 +0,0 @@
-package openresponses
-
-import (
-	"github.com/mudler/LocalAI/core/backend"
-	reason "github.com/mudler/LocalAI/pkg/reasoning"
-)
-
-// streamTokenRouting describes how a single streamed token's deltas should be
-// routed to Open Responses output items: the reasoning/content split and
-// whether a new reasoning or message output item must be opened before the
-// corresponding delta can be emitted.
-type streamTokenRouting struct {
-	ReasoningDelta string
-	ContentDelta   string
-	// OpenReasoningItem is true when a reasoning output item must be created
-	// before emitting ReasoningDelta (the first reasoning delta of the stream).
-	OpenReasoningItem bool
-	// OpenMessageItem is true when a message output item must be created before
-	// emitting ContentDelta (the first content delta of the stream).
-	OpenMessageItem bool
-}
-
-// streamReasoningRouter classifies streamed tokens into reasoning vs message
-// deltas and tracks which output items have been opened, so the SSE-emitting
-// code in handleOpenResponsesStream becomes a thin shell over a unit-testable
-// decision.
-//
-// It mirrors the sticky-preferAutoparser logic in the OpenAI chat streaming
-// worker (core/http/endpoints/openai/chat_stream_workers.go, processStream):
-// once the C++ autoparser has surfaced reasoning_content, we trust its
-// classification for the rest of the stream; until then we fall back to the
-// Go-side reasoning extractor so a pure-content autoparser (the non-jinja PEG
-// fallback, issue #9985) does not leak <think>...</think> tokens into content.
-//
-// Crucially, the decision to open and target a reasoning item keys off the
-// per-token reasoningDelta, NOT extractor.Reasoning(): the autoparser path
-// computes reasoning through ProcessChatDeltaReasoning, which updates a
-// separate accumulator that extractor.Reasoning() never exposes. Gating on
-// extractor.Reasoning() (issue #9658) dropped live reasoning whenever the
-// autoparser drove it via reasoning_content, surfacing it only after the
-// stream completed and mis-routing earlier deltas onto the msg_ item.
-type streamReasoningRouter struct {
-	extractor        *reason.ReasoningExtractor
-	preferAutoparser bool
-	reasoningOpened  bool
-	messageOpened    bool
-}
-
-func newStreamReasoningRouter(extractor *reason.ReasoningExtractor) *streamReasoningRouter {
-	return &streamReasoningRouter{extractor: extractor}
-}
-
-// classify splits a token into reasoning/content deltas using the sticky
-// preferAutoparser preference. Once the C++ autoparser has surfaced
-// reasoning_content we trust it for the rest of the stream; until then we fall
-// back to the Go-side extractor so a pure-content autoparser (zero
-// reasoning_content, issue #9985) does not leak <think>...</think> tokens into
-// content.
-func (r *streamReasoningRouter) classify(token string, usage backend.TokenUsage) (reasoningDelta, contentDelta string) {
-	goReasoning, goContent := r.extractor.ProcessToken(token)
-	if usage.HasChatDeltaContent() {
-		rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
-		if rawReasoning != "" {
-			r.preferAutoparser = true
-		}
-		if r.preferAutoparser {
-			contentDelta = cd
-			reasoningDelta = r.extractor.ProcessChatDeltaReasoning(rawReasoning)
-		} else {
-			reasoningDelta = goReasoning
-			contentDelta = goContent
-		}
-	} else {
-		reasoningDelta = goReasoning
-		contentDelta = goContent
-	}
-	return reasoningDelta, contentDelta
-}
-
-// route classifies a token and decides which output items its deltas target,
-// flipping the opened-flags as items are created.
-//
-// The reasoning gate keys off reasoningDelta, NOT extractor.Reasoning(): the
-// autoparser path computes reasoning via ProcessChatDeltaReasoning into a
-// separate accumulator that extractor.Reasoning() never reflects (issue #9658).
-func (r *streamReasoningRouter) route(token string, usage backend.TokenUsage) streamTokenRouting {
-	reasoningDelta, contentDelta := r.classify(token, usage)
-	out := streamTokenRouting{ReasoningDelta: reasoningDelta, ContentDelta: contentDelta}
-	if reasoningDelta != "" && !r.reasoningOpened {
-		out.OpenReasoningItem = true
-		r.reasoningOpened = true
-	}
-	if contentDelta != "" && !r.messageOpened {
-		out.OpenMessageItem = true
-		r.messageOpened = true
-	}
-	return out
-}
-
-// resetForIteration clears the per-stream routing state for an MCP re-inference
-// iteration, mirroring extractor.Reset() on the underlying extractor.
-func (r *streamReasoningRouter) resetForIteration() {
-	r.preferAutoparser = false
-	r.reasoningOpened = false
-	r.messageOpened = false
-	r.extractor.Reset()
-}
-
-// ReasoningStreamed reports whether a reasoning output item was opened during
-// the stream. The end-of-stream closing blocks key off this rather than a
-// reasoning-id string so the ordering (reasoning before message) is explicit.
-func (r *streamReasoningRouter) ReasoningStreamed() bool {
-	return r.reasoningOpened
-}
--- a/core/http/endpoints/openresponses/responses_stream_reasoning_test.go
+++ b/core/http/endpoints/openresponses/responses_stream_reasoning_test.go
@@ -1,101 +0,0 @@
-package openresponses
-
-import (
-	"github.com/mudler/LocalAI/core/backend"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	reason "github.com/mudler/LocalAI/pkg/reasoning"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-// usageWithChatDeltas builds a TokenUsage carrying a single C++ autoparser
-// ChatDelta with the given content / reasoning_content split.
-func usageWithChatDeltas(content, reasoningContent string) backend.TokenUsage {
-	return backend.TokenUsage{
-		ChatDeltas: []*pb.ChatDelta{
-			{Content: content, ReasoningContent: reasoningContent},
-		},
-	}
-}
-
-// Regression tests for issue #9658: in the /v1/responses streaming handler the
-// thinking monologue from a reasoning model was streamed to the client as a
-// normal message (msg_ item, output_text.delta) instead of as a reasoning
-// item, and was only re-classified into a reasoning item AFTER the stream
-// completed.
-//
-// Root cause: the live reasoning item was gated on extractor.Reasoning(),
-// which is only updated by the Go-side raw-tag parser (ProcessToken). When the
-// C++ autoparser drives reasoning through reasoning_content ChatDeltas, the
-// reasoning is computed via ProcessChatDeltaReasoning into a SEPARATE
-// accumulator, so extractor.Reasoning() stays empty and the gate never fires.
-var _ = Describe("streamReasoningRouter", func() {
-	Context("autoparser drives reasoning via reasoning_content (issue #9658)", func() {
-		It("opens a reasoning item during streaming and targets it (not the message)", func() {
-			extractor := reason.NewReasoningExtractor("", reason.Config{})
-			router := newStreamReasoningRouter(extractor)
-
-			// The raw token is empty: the autoparser carries the reasoning in
-			// ChatDelta.ReasoningContent, so the Go-side extractor's
-			// Reasoning() stays "" — exactly the state in which the buggy
-			// extractor.Reasoning() gate failed to open a reasoning item.
-			routing := router.route("", usageWithChatDeltas("", "Let me think about this"))
-
-			Expect(routing.ReasoningDelta).To(Equal("Let me think about this"),
-				"the autoparser's reasoning_content must surface as a reasoning delta during streaming")
-			Expect(routing.OpenReasoningItem).To(BeTrue(),
-				"a reasoning output item must be opened live, not deferred to end-of-stream (#9658)")
-			Expect(routing.ContentDelta).To(BeEmpty())
-			Expect(routing.OpenMessageItem).To(BeFalse(),
-				"reasoning deltas must target the reasoning_ item, never open/route to a msg_ item")
-		})
-
-		It("does not re-open the reasoning item on subsequent reasoning deltas", func() {
-			extractor := reason.NewReasoningExtractor("", reason.Config{})
-			router := newStreamReasoningRouter(extractor)
-
-			_ = router.route("", usageWithChatDeltas("", "first "))
-			routing := router.route("", usageWithChatDeltas("", "second"))
-
-			Expect(routing.ReasoningDelta).To(Equal("second"))
-			Expect(routing.OpenReasoningItem).To(BeFalse())
-		})
-	})
-
-	Context("pure content stream", func() {
-		It("never opens a reasoning item", func() {
-			extractor := reason.NewReasoningExtractor("", reason.Config{})
-			router := newStreamReasoningRouter(extractor)
-
-			// Content-only with no reasoning_content: the autoparser is in its
-			// pure-content mode, so the router stays on the Go-side extractor,
-			// which sees the content via the raw token.
-			routing := router.route("hello world", usageWithChatDeltas("hello world", ""))
-
-			Expect(routing.ContentDelta).To(Equal("hello world"))
-			Expect(routing.OpenMessageItem).To(BeTrue())
-			Expect(routing.OpenReasoningItem).To(BeFalse(),
-				"a content-only stream must never open a reasoning item")
-			Expect(router.ReasoningStreamed()).To(BeFalse())
-		})
-	})
-
-	Context("content-only autoparser with embedded <think> (issue #9985 fallback)", func() {
-		It("falls back to Go-side extraction instead of leaking <think> into content", func() {
-			extractor := reason.NewReasoningExtractor("", reason.Config{})
-			router := newStreamReasoningRouter(extractor)
-
-			// The autoparser is in its non-jinja pure-content fallback: it
-			// surfaces the whole string as Content with zero reasoning_content,
-			// tags and all. The router must NOT trust it (preferAutoparser must
-			// stay false) and instead use the Go-side split.
-			routing := router.route("<think>reasoning here</think>answer",
-				usageWithChatDeltas("<think>reasoning here</think>answer", ""))
-
-			Expect(routing.ContentDelta).To(Equal("answer"),
-				"content must be the cleaned answer, not the raw <think>...</think> string")
-			Expect(routing.ReasoningDelta).To(Equal("reasoning here"))
-			Expect(routing.OpenReasoningItem).To(BeTrue())
-		})
-	})
-})