mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-12 18:58:49 -04:00
Compare commits
1 Commits
fix/9658-r
...
fix/7461-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69c7a8e71d |
@@ -5,6 +5,31 @@ imported by any backend that needs to parse LocalAI gRPC options or build a
|
||||
chat-template-compatible message list from proto Message objects.
|
||||
"""
|
||||
import json
|
||||
from urllib.parse import unquote
|
||||
|
||||
|
||||
def resolve_model_path(model, model_file=""):
|
||||
"""Resolve a LocalAI model reference to something an HF/MLX loader accepts.
|
||||
|
||||
LocalAI hands backends either a plain HuggingFace repo id
|
||||
(``namespace/name``), an already-local filesystem path, or a
|
||||
``file://`` URI (its ``LocalPrefix``) for models imported from disk.
|
||||
Loaders such as ``mlx_lm.load`` reject the ``file://`` form because the
|
||||
scheme is neither a valid repo id nor an existing path, so we normalize
|
||||
it here before loading.
|
||||
|
||||
Resolution order:
|
||||
1. Prefer ``model_file`` when set and non-empty - that is the resolved
|
||||
local path LocalAI computed for the model.
|
||||
2. Strip a ``file://`` scheme and percent-decode it to a plain path.
|
||||
3. Leave plain repo ids and already-local paths unchanged.
|
||||
"""
|
||||
candidate = model_file if model_file else model
|
||||
if candidate is None:
|
||||
return candidate
|
||||
if candidate.startswith("file://"):
|
||||
return unquote(candidate[len("file://"):])
|
||||
return candidate
|
||||
|
||||
|
||||
def parse_options(options_list):
|
||||
|
||||
@@ -28,7 +28,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options
|
||||
from python_utils import messages_to_dicts, parse_options as _shared_parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
|
||||
@@ -99,7 +99,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_lm import load
|
||||
from mlx_lm.models.cache import make_prompt_cache, can_trim_prompt_cache, trim_prompt_cache
|
||||
|
||||
print(f"[Rank 0] Loading model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"[Rank 0] Loading model: {model_path}", file=sys.stderr)
|
||||
|
||||
self.options = parse_options(request.Options)
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
@@ -128,7 +132,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
self.coordinator = DistributedCoordinator(self.group)
|
||||
self.coordinator.broadcast_command(CMD_LOAD_MODEL)
|
||||
self.coordinator.broadcast_model_name(request.Model)
|
||||
self.coordinator.broadcast_model_name(model_path)
|
||||
else:
|
||||
print("[Rank 0] No hostfile configured, running single-node", file=sys.stderr)
|
||||
|
||||
@@ -144,9 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
if self.group is not None:
|
||||
from sharding import pipeline_auto_parallel
|
||||
@@ -157,7 +161,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
from mlx_cache import ThreadSafeLRUPromptCache
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -18,7 +18,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_vlm import load, stream_generate
|
||||
@@ -67,7 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX-VLM model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_vlm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX-VLM model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict
|
||||
@@ -76,10 +80,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Load model and processor using MLX-VLM
|
||||
# mlx-vlm load function returns (model, processor) instead of (model, tokenizer)
|
||||
self.model, self.processor = load(request.Model)
|
||||
self.model, self.processor = load(model_path)
|
||||
|
||||
# Load model config for chat template support
|
||||
self.config = load_config(request.Model)
|
||||
self.config = load_config(model_path)
|
||||
|
||||
# Auto-infer the tool parser from the chat template. mlx-vlm has
|
||||
# its own _infer_tool_parser that falls back to mlx-lm parsers.
|
||||
|
||||
@@ -17,7 +17,7 @@ import grpc
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
|
||||
from grpc_auth import get_auth_interceptors
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
from mlx_lm import load, stream_generate
|
||||
@@ -63,7 +63,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
backend_pb2.Result: The load model result.
|
||||
"""
|
||||
try:
|
||||
print(f"Loading MLX model: {request.Model}", file=sys.stderr)
|
||||
# Normalize the model reference: strip LocalAI's file:// LocalPrefix
|
||||
# and prefer the resolved ModelFile so mlx_lm.load() gets a plain
|
||||
# repo id or filesystem path (it rejects file:// URIs).
|
||||
model_path = resolve_model_path(request.Model, request.ModelFile)
|
||||
print(f"Loading MLX model: {model_path}", file=sys.stderr)
|
||||
print(f"Request: {request}", file=sys.stderr)
|
||||
|
||||
# Parse Options[] key:value strings into a typed dict (shared helper)
|
||||
@@ -89,9 +93,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Load model and tokenizer using MLX
|
||||
if tokenizer_config:
|
||||
print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
|
||||
self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
|
||||
self.model, self.tokenizer = load(model_path, tokenizer_config=tokenizer_config)
|
||||
else:
|
||||
self.model, self.tokenizer = load(request.Model)
|
||||
self.model, self.tokenizer = load(model_path)
|
||||
|
||||
# mlx_lm.load() returns a TokenizerWrapper that detects tool
|
||||
# calling and thinking markers from the chat template / vocab.
|
||||
@@ -111,7 +115,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# Initialize thread-safe LRU prompt cache for efficient generation
|
||||
max_cache_entries = self.options.get("max_cache_entries", 10)
|
||||
self.max_kv_size = self.options.get("max_kv_size", None)
|
||||
self.model_key = request.Model
|
||||
self.model_key = model_path
|
||||
self.lru_cache = ThreadSafeLRUPromptCache(
|
||||
max_size=max_cache_entries,
|
||||
can_trim_fn=can_trim_prompt_cache,
|
||||
|
||||
@@ -12,7 +12,7 @@ import backend_pb2_grpc
|
||||
# Make the shared helpers importable so we can unit-test them without a
|
||||
# running gRPC server.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
|
||||
from python_utils import messages_to_dicts, parse_options
|
||||
from python_utils import messages_to_dicts, parse_options, resolve_model_path
|
||||
from mlx_utils import parse_tool_calls, split_reasoning
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
|
||||
@@ -322,6 +322,42 @@ class TestSharedHelpers(unittest.TestCase):
|
||||
self.assertEqual(r, "")
|
||||
self.assertEqual(c, "just text")
|
||||
|
||||
def test_resolve_model_path_file_uri(self):
|
||||
# file:// LocalPrefix (LocalAI import) is stripped to a plain path.
|
||||
self.assertEqual(resolve_model_path("file:///a/b"), "/a/b")
|
||||
|
||||
def test_resolve_model_path_file_uri_percent_decoded(self):
|
||||
# Percent-encoded characters (e.g. spaces) are decoded.
|
||||
self.assertEqual(
|
||||
resolve_model_path("file:///Users/me/My%20Models/Qwen3"),
|
||||
"/Users/me/My Models/Qwen3",
|
||||
)
|
||||
|
||||
def test_resolve_model_path_hf_repo_id_unchanged(self):
|
||||
# Plain HuggingFace repo ids must pass through untouched.
|
||||
self.assertEqual(
|
||||
resolve_model_path("mlx-community/Qwen3-Coder-30B"),
|
||||
"mlx-community/Qwen3-Coder-30B",
|
||||
)
|
||||
|
||||
def test_resolve_model_path_local_path_unchanged(self):
|
||||
# An already-local absolute path is left as-is.
|
||||
self.assertEqual(resolve_model_path("/models/Qwen3"), "/models/Qwen3")
|
||||
|
||||
def test_resolve_model_path_prefers_model_file(self):
|
||||
# The resolved ModelFile wins over Model when both are set.
|
||||
self.assertEqual(
|
||||
resolve_model_path("file:///ignored", "/resolved/local/path"),
|
||||
"/resolved/local/path",
|
||||
)
|
||||
|
||||
def test_resolve_model_path_model_file_file_uri(self):
|
||||
# A ModelFile that is itself a file:// URI is also normalized.
|
||||
self.assertEqual(
|
||||
resolve_model_path("ignored", "file:///a/b"),
|
||||
"/a/b",
|
||||
)
|
||||
|
||||
def test_parse_tool_calls_with_shim(self):
|
||||
tm = types.SimpleNamespace(
|
||||
tool_call_start="<tool_call>",
|
||||
|
||||
@@ -1648,12 +1648,6 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
var currentReasoningContentIndex int
|
||||
var reasoningTokens int
|
||||
extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)
|
||||
// router classifies each streamed token into reasoning vs message deltas
|
||||
// and decides which output item they target. It encapsulates the
|
||||
// sticky-preferAutoparser fallback and the reasoningDelta-based gate that
|
||||
// fix issue #9658 (live reasoning was mis-routed onto the msg_ item and
|
||||
// only re-classified as a reasoning item after the stream completed).
|
||||
router := newStreamReasoningRouter(extractor)
|
||||
|
||||
// Collect all output items for storage
|
||||
var collectedOutputItems []schema.ORItemField
|
||||
@@ -1677,7 +1671,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Reset reasoning and tool-call state for re-inference so reasoning
|
||||
// extraction runs again on subsequent iterations
|
||||
inToolCallMode = false
|
||||
router.resetForIteration()
|
||||
extractor.Reset()
|
||||
currentMessageID = ""
|
||||
lastEmittedToolCallCount = 0
|
||||
currentReasoningID = ""
|
||||
@@ -1838,101 +1832,110 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
|
||||
// If no tool calls detected yet, handle reasoning and text
|
||||
if !inToolCallMode {
|
||||
routing := router.route(token, tokenUsage)
|
||||
var reasoningDelta, contentDelta string
|
||||
goReasoning, goContent := extractor.ProcessToken(token)
|
||||
|
||||
// Handle reasoning item. The reasoning item is opened lazily
|
||||
// on the first reasoning delta - gating on routing, not
|
||||
// extractor.Reasoning() (issue #9658): when the C++
|
||||
// autoparser drives reasoning via reasoning_content,
|
||||
// extractor.Reasoning() stays empty and the old gate dropped
|
||||
// the live reasoning item.
|
||||
if routing.OpenReasoningItem {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else {
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
if extractor.Reasoning() != "" {
|
||||
// Check if we need to create reasoning item
|
||||
if currentReasoningID == "" {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit reasoning delta against the reasoning_ item id.
|
||||
if routing.ReasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Delta: strPtr(routing.ReasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
|
||||
// Open the message item lazily on the first content delta.
|
||||
if routing.OpenMessageItem {
|
||||
outputIndex++
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem := &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
// Emit reasoning delta if there's new content
|
||||
if reasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Delta: strPtr(reasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit text delta against the msg_ item id.
|
||||
if routing.ContentDelta != "" {
|
||||
// Only emit message content if there's actual content (not just reasoning)
|
||||
if contentDelta != "" {
|
||||
if currentMessageID == "" {
|
||||
// Emit output_item.added for message
|
||||
outputIndex++
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem := &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Delta: strPtr(routing.ContentDelta),
|
||||
Delta: strPtr(contentDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
@@ -2328,109 +2331,112 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
return nil
|
||||
}
|
||||
|
||||
// Non-tool-call streaming path.
|
||||
//
|
||||
// The message output item is created LAZILY on the first content delta
|
||||
// (mirroring the tool-call path), not eagerly before the first token.
|
||||
// Issue #9658: an eager msg_ item forced reasoning to a higher output
|
||||
// index and made mis-split <think> text land on the pre-existing message,
|
||||
// so the thinking monologue streamed as message text instead of reasoning.
|
||||
var messageItem *schema.ORItemField
|
||||
// Non-tool-call streaming path
|
||||
// Emit output_item.added for message
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem := &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyTextPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &emptyTextPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Stream text deltas with reasoning extraction
|
||||
tokenCallback := func(token string, tokenUsage backend.TokenUsage) bool {
|
||||
accumulatedText += token
|
||||
|
||||
routing := router.route(token, tokenUsage)
|
||||
var reasoningDelta, contentDelta string
|
||||
goReasoning, goContent := extractor.ProcessToken(token)
|
||||
|
||||
// Open the reasoning item lazily on the first reasoning delta.
|
||||
if routing.OpenReasoningItem {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
if tokenUsage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
|
||||
contentDelta = cd
|
||||
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else {
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
|
||||
// Handle reasoning item
|
||||
if extractor.Reasoning() != "" {
|
||||
// Check if we need to create reasoning item
|
||||
if currentReasoningID == "" {
|
||||
outputIndex++
|
||||
currentReasoningID = fmt.Sprintf("reasoning_%s", uuid.New().String())
|
||||
reasoningItem := &schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
Status: "in_progress",
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: reasoningItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added for reasoning
|
||||
currentReasoningContentIndex = 0
|
||||
emptyPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Part: &emptyPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit reasoning delta against the reasoning_ item id.
|
||||
if routing.ReasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Delta: strPtr(routing.ReasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
|
||||
// Open the message item lazily on the first content delta.
|
||||
if routing.OpenMessageItem {
|
||||
outputIndex++
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem = &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
// Emit reasoning delta if there's new content
|
||||
if reasoningDelta != "" {
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentReasoningID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tReasoningContentIndex,
|
||||
Delta: strPtr(reasoningDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
c.Response().Flush()
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.added
|
||||
currentContentIndex = 0
|
||||
emptyTextPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &emptyTextPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
|
||||
// Emit text delta against the msg_ item id.
|
||||
if routing.ContentDelta != "" {
|
||||
// Only emit message content if there's actual content (not just reasoning)
|
||||
if contentDelta != "" {
|
||||
// Emit text delta
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.delta",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Delta: strPtr(routing.ContentDelta),
|
||||
Delta: strPtr(contentDelta),
|
||||
Logprobs: emptyLogprobs(),
|
||||
})
|
||||
sequenceNumber++
|
||||
@@ -2555,78 +2561,40 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Convert logprobs for streaming events
|
||||
mcpStreamLogprobs := convertLogprobsForStreaming(noToolLogprobs)
|
||||
|
||||
// The message item is created lazily on the first content delta (issue
|
||||
// #9658). If no content streamed but final extraction produced text (e.g.
|
||||
// the autoparser delivered everything at once), open the message item now
|
||||
// so the closing events below are valid. A pure-reasoning turn (no content
|
||||
// at all) leaves messageItem nil and emits no message item.
|
||||
if messageItem == nil && result != "" {
|
||||
outputIndex++
|
||||
currentMessageID = fmt.Sprintf("msg_%s", uuid.New().String())
|
||||
messageItem = &schema.ORItemField{
|
||||
Type: "message",
|
||||
ID: currentMessageID,
|
||||
Status: "in_progress",
|
||||
Role: "assistant",
|
||||
Content: []schema.ORContentPart{},
|
||||
}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
// Emit output_text.done
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Text: strPtr(result),
|
||||
Logprobs: logprobsPtr(mcpStreamLogprobs),
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
currentContentIndex = 0
|
||||
emptyTextPart := makeOutputTextPart("")
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.added",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &emptyTextPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
// Emit content_part.done (with actual logprobs)
|
||||
resultPart := makeOutputTextPartWithLogprobs(result, noToolLogprobs)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &resultPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
if messageItem != nil {
|
||||
// Emit output_text.done
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_text.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Text: strPtr(result),
|
||||
Logprobs: logprobsPtr(mcpStreamLogprobs),
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit content_part.done (with actual logprobs)
|
||||
resultPart := makeOutputTextPartWithLogprobs(result, noToolLogprobs)
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.content_part.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
ItemID: currentMessageID,
|
||||
OutputIndex: &outputIndex,
|
||||
ContentIndex: ¤tContentIndex,
|
||||
Part: &resultPart,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit output_item.done (with actual logprobs)
|
||||
messageItem.Status = "completed"
|
||||
messageItem.Content = []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, noToolLogprobs)}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
}
|
||||
// Emit output_item.done (with actual logprobs)
|
||||
messageItem.Status = "completed"
|
||||
messageItem.Content = []schema.ORContentPart{makeOutputTextPartWithLogprobs(result, noToolLogprobs)}
|
||||
sendSSEEvent(c, &schema.ORStreamEvent{
|
||||
Type: "response.output_item.done",
|
||||
SequenceNumber: sequenceNumber,
|
||||
OutputIndex: &outputIndex,
|
||||
Item: messageItem,
|
||||
})
|
||||
sequenceNumber++
|
||||
|
||||
// Emit function_call items from automatic tool parsing fallback
|
||||
for _, fc := range streamFallbackToolCalls {
|
||||
@@ -2663,13 +2631,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
// Emit response.completed
|
||||
now := time.Now().Unix()
|
||||
|
||||
// Collect final output items, ordered reasoning -> message -> tool calls.
|
||||
// Issue #9658: reasoning is emitted as its own item ahead of the message,
|
||||
// matching the streamed order (reasoning item is opened before the message
|
||||
// item when the model thinks first).
|
||||
// Collect final output items (reasoning first, then messages, then tool calls)
|
||||
var finalOutputItems []schema.ORItemField
|
||||
// Add reasoning item if one was streamed.
|
||||
if router.ReasoningStreamed() && finalReasoning != "" {
|
||||
// Add reasoning item if it exists
|
||||
if currentReasoningID != "" && finalReasoning != "" {
|
||||
finalOutputItems = append(finalOutputItems, schema.ORItemField{
|
||||
Type: "reasoning",
|
||||
ID: currentReasoningID,
|
||||
@@ -2677,12 +2642,18 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
Content: []schema.ORContentPart{makeOutputTextPart(finalReasoning)},
|
||||
})
|
||||
}
|
||||
// Add the message item if one was produced (created lazily, so it may be
|
||||
// nil for a pure-reasoning turn).
|
||||
if messageItem != nil {
|
||||
// Add message item
|
||||
if len(collectedOutputItems) > 0 {
|
||||
// Use collected items (may include reasoning already)
|
||||
for _, item := range collectedOutputItems {
|
||||
if item.Type == "message" {
|
||||
finalOutputItems = append(finalOutputItems, item)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
finalOutputItems = append(finalOutputItems, *messageItem)
|
||||
}
|
||||
// Add function_call items from fallback parsing.
|
||||
// Add function_call items from fallback
|
||||
for _, item := range collectedOutputItems {
|
||||
if item.Type == "function_call" {
|
||||
finalOutputItems = append(finalOutputItems, item)
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
package openresponses
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
||||
)
|
||||
|
||||
// streamTokenRouting describes how a single streamed token's deltas should be
|
||||
// routed to Open Responses output items: the reasoning/content split and
|
||||
// whether a new reasoning or message output item must be opened before the
|
||||
// corresponding delta can be emitted.
|
||||
type streamTokenRouting struct {
|
||||
ReasoningDelta string
|
||||
ContentDelta string
|
||||
// OpenReasoningItem is true when a reasoning output item must be created
|
||||
// before emitting ReasoningDelta (the first reasoning delta of the stream).
|
||||
OpenReasoningItem bool
|
||||
// OpenMessageItem is true when a message output item must be created before
|
||||
// emitting ContentDelta (the first content delta of the stream).
|
||||
OpenMessageItem bool
|
||||
}
|
||||
|
||||
// streamReasoningRouter classifies streamed tokens into reasoning vs message
|
||||
// deltas and tracks which output items have been opened, so the SSE-emitting
|
||||
// code in handleOpenResponsesStream becomes a thin shell over a unit-testable
|
||||
// decision.
|
||||
//
|
||||
// It mirrors the sticky-preferAutoparser logic in the OpenAI chat streaming
|
||||
// worker (core/http/endpoints/openai/chat_stream_workers.go, processStream):
|
||||
// once the C++ autoparser has surfaced reasoning_content, we trust its
|
||||
// classification for the rest of the stream; until then we fall back to the
|
||||
// Go-side reasoning extractor so a pure-content autoparser (the non-jinja PEG
|
||||
// fallback, issue #9985) does not leak <think>...</think> tokens into content.
|
||||
//
|
||||
// Crucially, the decision to open and target a reasoning item keys off the
|
||||
// per-token reasoningDelta, NOT extractor.Reasoning(): the autoparser path
|
||||
// computes reasoning through ProcessChatDeltaReasoning, which updates a
|
||||
// separate accumulator that extractor.Reasoning() never exposes. Gating on
|
||||
// extractor.Reasoning() (issue #9658) dropped live reasoning whenever the
|
||||
// autoparser drove it via reasoning_content, surfacing it only after the
|
||||
// stream completed and mis-routing earlier deltas onto the msg_ item.
|
||||
type streamReasoningRouter struct {
|
||||
extractor *reason.ReasoningExtractor
|
||||
preferAutoparser bool
|
||||
reasoningOpened bool
|
||||
messageOpened bool
|
||||
}
|
||||
|
||||
func newStreamReasoningRouter(extractor *reason.ReasoningExtractor) *streamReasoningRouter {
|
||||
return &streamReasoningRouter{extractor: extractor}
|
||||
}
|
||||
|
||||
// classify splits a token into reasoning/content deltas using the sticky
|
||||
// preferAutoparser preference. Once the C++ autoparser has surfaced
|
||||
// reasoning_content we trust it for the rest of the stream; until then we fall
|
||||
// back to the Go-side extractor so a pure-content autoparser (zero
|
||||
// reasoning_content, issue #9985) does not leak <think>...</think> tokens into
|
||||
// content.
|
||||
func (r *streamReasoningRouter) classify(token string, usage backend.TokenUsage) (reasoningDelta, contentDelta string) {
|
||||
goReasoning, goContent := r.extractor.ProcessToken(token)
|
||||
if usage.HasChatDeltaContent() {
|
||||
rawReasoning, cd := usage.ChatDeltaReasoningAndContent()
|
||||
if rawReasoning != "" {
|
||||
r.preferAutoparser = true
|
||||
}
|
||||
if r.preferAutoparser {
|
||||
contentDelta = cd
|
||||
reasoningDelta = r.extractor.ProcessChatDeltaReasoning(rawReasoning)
|
||||
} else {
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
} else {
|
||||
reasoningDelta = goReasoning
|
||||
contentDelta = goContent
|
||||
}
|
||||
return reasoningDelta, contentDelta
|
||||
}
|
||||
|
||||
// route classifies a token and decides which output items its deltas target,
|
||||
// flipping the opened-flags as items are created.
|
||||
//
|
||||
// The reasoning gate keys off reasoningDelta, NOT extractor.Reasoning(): the
|
||||
// autoparser path computes reasoning via ProcessChatDeltaReasoning into a
|
||||
// separate accumulator that extractor.Reasoning() never reflects (issue #9658).
|
||||
func (r *streamReasoningRouter) route(token string, usage backend.TokenUsage) streamTokenRouting {
|
||||
reasoningDelta, contentDelta := r.classify(token, usage)
|
||||
out := streamTokenRouting{ReasoningDelta: reasoningDelta, ContentDelta: contentDelta}
|
||||
if reasoningDelta != "" && !r.reasoningOpened {
|
||||
out.OpenReasoningItem = true
|
||||
r.reasoningOpened = true
|
||||
}
|
||||
if contentDelta != "" && !r.messageOpened {
|
||||
out.OpenMessageItem = true
|
||||
r.messageOpened = true
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// resetForIteration clears the per-stream routing state for an MCP re-inference
|
||||
// iteration, mirroring extractor.Reset() on the underlying extractor.
|
||||
func (r *streamReasoningRouter) resetForIteration() {
|
||||
r.preferAutoparser = false
|
||||
r.reasoningOpened = false
|
||||
r.messageOpened = false
|
||||
r.extractor.Reset()
|
||||
}
|
||||
|
||||
// ReasoningStreamed reports whether a reasoning output item was opened during
|
||||
// the stream. The end-of-stream closing blocks key off this rather than a
|
||||
// reasoning-id string so the ordering (reasoning before message) is explicit.
|
||||
func (r *streamReasoningRouter) ReasoningStreamed() bool {
|
||||
return r.reasoningOpened
|
||||
}
|
||||
@@ -1,101 +0,0 @@
|
||||
package openresponses
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
reason "github.com/mudler/LocalAI/pkg/reasoning"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// usageWithChatDeltas builds a TokenUsage carrying a single C++ autoparser
|
||||
// ChatDelta with the given content / reasoning_content split.
|
||||
func usageWithChatDeltas(content, reasoningContent string) backend.TokenUsage {
|
||||
return backend.TokenUsage{
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: content, ReasoningContent: reasoningContent},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Regression tests for issue #9658: in the /v1/responses streaming handler the
|
||||
// thinking monologue from a reasoning model was streamed to the client as a
|
||||
// normal message (msg_ item, output_text.delta) instead of as a reasoning
|
||||
// item, and was only re-classified into a reasoning item AFTER the stream
|
||||
// completed.
|
||||
//
|
||||
// Root cause: the live reasoning item was gated on extractor.Reasoning(),
|
||||
// which is only updated by the Go-side raw-tag parser (ProcessToken). When the
|
||||
// C++ autoparser drives reasoning through reasoning_content ChatDeltas, the
|
||||
// reasoning is computed via ProcessChatDeltaReasoning into a SEPARATE
|
||||
// accumulator, so extractor.Reasoning() stays empty and the gate never fires.
|
||||
var _ = Describe("streamReasoningRouter", func() {
|
||||
Context("autoparser drives reasoning via reasoning_content (issue #9658)", func() {
|
||||
It("opens a reasoning item during streaming and targets it (not the message)", func() {
|
||||
extractor := reason.NewReasoningExtractor("", reason.Config{})
|
||||
router := newStreamReasoningRouter(extractor)
|
||||
|
||||
// The raw token is empty: the autoparser carries the reasoning in
|
||||
// ChatDelta.ReasoningContent, so the Go-side extractor's
|
||||
// Reasoning() stays "" — exactly the state in which the buggy
|
||||
// extractor.Reasoning() gate failed to open a reasoning item.
|
||||
routing := router.route("", usageWithChatDeltas("", "Let me think about this"))
|
||||
|
||||
Expect(routing.ReasoningDelta).To(Equal("Let me think about this"),
|
||||
"the autoparser's reasoning_content must surface as a reasoning delta during streaming")
|
||||
Expect(routing.OpenReasoningItem).To(BeTrue(),
|
||||
"a reasoning output item must be opened live, not deferred to end-of-stream (#9658)")
|
||||
Expect(routing.ContentDelta).To(BeEmpty())
|
||||
Expect(routing.OpenMessageItem).To(BeFalse(),
|
||||
"reasoning deltas must target the reasoning_ item, never open/route to a msg_ item")
|
||||
})
|
||||
|
||||
It("does not re-open the reasoning item on subsequent reasoning deltas", func() {
|
||||
extractor := reason.NewReasoningExtractor("", reason.Config{})
|
||||
router := newStreamReasoningRouter(extractor)
|
||||
|
||||
_ = router.route("", usageWithChatDeltas("", "first "))
|
||||
routing := router.route("", usageWithChatDeltas("", "second"))
|
||||
|
||||
Expect(routing.ReasoningDelta).To(Equal("second"))
|
||||
Expect(routing.OpenReasoningItem).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Context("pure content stream", func() {
|
||||
It("never opens a reasoning item", func() {
|
||||
extractor := reason.NewReasoningExtractor("", reason.Config{})
|
||||
router := newStreamReasoningRouter(extractor)
|
||||
|
||||
// Content-only with no reasoning_content: the autoparser is in its
|
||||
// pure-content mode, so the router stays on the Go-side extractor,
|
||||
// which sees the content via the raw token.
|
||||
routing := router.route("hello world", usageWithChatDeltas("hello world", ""))
|
||||
|
||||
Expect(routing.ContentDelta).To(Equal("hello world"))
|
||||
Expect(routing.OpenMessageItem).To(BeTrue())
|
||||
Expect(routing.OpenReasoningItem).To(BeFalse(),
|
||||
"a content-only stream must never open a reasoning item")
|
||||
Expect(router.ReasoningStreamed()).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Context("content-only autoparser with embedded <think> (issue #9985 fallback)", func() {
|
||||
It("falls back to Go-side extraction instead of leaking <think> into content", func() {
|
||||
extractor := reason.NewReasoningExtractor("", reason.Config{})
|
||||
router := newStreamReasoningRouter(extractor)
|
||||
|
||||
// The autoparser is in its non-jinja pure-content fallback: it
|
||||
// surfaces the whole string as Content with zero reasoning_content,
|
||||
// tags and all. The router must NOT trust it (preferAutoparser must
|
||||
// stay false) and instead use the Go-side split.
|
||||
routing := router.route("<think>reasoning here</think>answer",
|
||||
usageWithChatDeltas("<think>reasoning here</think>answer", ""))
|
||||
|
||||
Expect(routing.ContentDelta).To(Equal("answer"),
|
||||
"content must be the cleaned answer, not the raw <think>...</think> string")
|
||||
Expect(routing.ReasoningDelta).To(Equal("reasoning here"))
|
||||
Expect(routing.OpenReasoningItem).To(BeTrue())
|
||||
})
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user