mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-09 01:07:09 -04:00
Compare commits
3 Commits
dependabot
...
fix/autopa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
184a425474 | ||
|
|
a102779c8b | ||
|
|
9323f4b5ca |
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=9e3b928fd8c9d14dbf15a8768b9fdd7e5c721d66
|
||||
LLAMA_VERSION?=28ca1e600c5dac1854fb7e09611914013430b037
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -381,6 +381,15 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
|
||||
});
|
||||
}
|
||||
|
||||
// for each video in the request, add the video data
|
||||
for (int i = 0; i < predict->videos_size(); i++) {
|
||||
data["video_data"].push_back(json
|
||||
{
|
||||
{"id", i},
|
||||
{"data", predict->videos(i)},
|
||||
});
|
||||
}
|
||||
|
||||
data["stop"] = predict->stopprompts();
|
||||
// data["n_probs"] = predict->nprobs();
|
||||
//TODO: images,
|
||||
@@ -1503,7 +1512,7 @@ public:
|
||||
msg_json["role"] = msg.role();
|
||||
|
||||
bool is_last_user_msg = (i == last_user_msg_idx);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
|
||||
|
||||
// Handle content - can be string, null, or array
|
||||
// For multimodal content, we'll embed images/audio from separate fields
|
||||
@@ -1554,6 +1563,16 @@ public:
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else {
|
||||
// Use content as-is (already array or not last user message)
|
||||
@@ -1588,6 +1607,16 @@ public:
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else if (msg.role() == "tool") {
|
||||
// Tool role messages must have content field set, even if empty
|
||||
@@ -2039,6 +2068,16 @@ public:
|
||||
files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
|
||||
const auto &video_data = data.find("video_data");
|
||||
if (video_data != data.end() && video_data->is_array())
|
||||
{
|
||||
for (const auto &video : *video_data)
|
||||
{
|
||||
auto decoded_data = base64_decode(video["data"].get<std::string>());
|
||||
files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const bool has_mtmd = ctx_server.impl->mctx != nullptr;
|
||||
@@ -2291,7 +2330,7 @@ public:
|
||||
}
|
||||
|
||||
bool is_last_user_msg = (i == last_user_msg_idx);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
|
||||
|
||||
// Handle content - can be string, null, or array
|
||||
// For multimodal content, we'll embed images/audio from separate fields
|
||||
@@ -2344,6 +2383,16 @@ public:
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else {
|
||||
// Use content as-is (already array or not last user message)
|
||||
@@ -2383,6 +2432,16 @@ public:
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
|
||||
} else if (!msg.tool_calls().empty()) {
|
||||
@@ -2845,6 +2904,16 @@ public:
|
||||
files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
|
||||
const auto &video_data = data.find("video_data");
|
||||
if (video_data != data.end() && video_data->is_array())
|
||||
{
|
||||
for (const auto &video : *video_data)
|
||||
{
|
||||
auto decoded_data = base64_decode(video["data"].get<std::string>());
|
||||
files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// process files
|
||||
|
||||
@@ -103,7 +103,12 @@ func applyAutoparserOverride(
|
||||
// blocks like "<think></think>" that some models emit when reasoning
|
||||
// is disabled.
|
||||
if deltaReasoning == "" && deltaContent != "" {
|
||||
deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
|
||||
// Complete-response extraction: only honor a prefilled <think> start
|
||||
// token when deltaContent actually closes the reasoning block. Without
|
||||
// it the model answered directly and the whole answer must stay in
|
||||
// content rather than be swallowed as unclosed reasoning. See
|
||||
// reason.ExtractReasoningComplete.
|
||||
deltaReasoning, deltaContent = reason.ExtractReasoningComplete(deltaContent, thinkingStartToken, reasoningConfig)
|
||||
}
|
||||
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
|
||||
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
|
||||
|
||||
@@ -186,6 +186,114 @@ var _ = Describe("applyAutoparserOverride", func() {
|
||||
Expect(result).To(Equal(existing))
|
||||
})
|
||||
})
|
||||
|
||||
// Regression tests for the prefilled-thinking-token path (thinkingStartToken
|
||||
// != ""). This is the configuration the gallery qwen3 family runs in: the
|
||||
// chat template injects <think> into the prompt, so DetectThinkingStartToken
|
||||
// returns "<think>" and the model's output begins *inside* a reasoning block
|
||||
// — it emits a closing </think> but no opening tag.
|
||||
//
|
||||
// The defensive Go-side fallback prepends the start token so the standard
|
||||
// extractor can pair it with the model's </think>. But on a *complete*
|
||||
// response that contains NO closing tag (the model answered directly with no
|
||||
// reasoning at all), prepending <think> manufactures an unclosed block that
|
||||
// swallows the entire answer into reasoning, leaving content empty. That is
|
||||
// the bug: short/direct answers (session names, JSON summaries) come back
|
||||
// with an empty content field.
|
||||
Context("autoparser delivered content with empty reasoning and a prefilled thinking token", func() {
|
||||
const startToken = "<think>"
|
||||
|
||||
It("keeps a tag-less direct answer as content instead of swallowing it as reasoning", func() {
|
||||
// Model answered directly: no <think>, no </think> anywhere.
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: "hello", ReasoningContent: ""},
|
||||
}
|
||||
|
||||
result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
|
||||
|
||||
Expect(result).To(HaveLen(1))
|
||||
Expect(result[0].Message.Content).ToNot(BeNil())
|
||||
Expect(*(result[0].Message.Content.(*string))).To(Equal("hello"),
|
||||
"a complete answer with no closing reasoning tag must stay in content")
|
||||
Expect(result[0].Message.Reasoning).To(BeNil(),
|
||||
"no reasoning block was emitted, so Reasoning must not be set")
|
||||
})
|
||||
|
||||
It("keeps a tag-less JSON answer as content (the summary case)", func() {
|
||||
raw := `{"short":"Tests pass","long":"go test ./... succeeded."}`
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: raw, ReasoningContent: ""},
|
||||
}
|
||||
|
||||
result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
|
||||
|
||||
Expect(result).To(HaveLen(1))
|
||||
Expect(*(result[0].Message.Content.(*string))).To(Equal(raw))
|
||||
Expect(result[0].Message.Reasoning).To(BeNil())
|
||||
})
|
||||
|
||||
It("still splits reasoning when the model emits the closing tag (prefill paired with </think>)", func() {
|
||||
// The legitimate prefill case: <think> was in the prompt, so the
|
||||
// output carries only the closing tag. The closing tag is the proof
|
||||
// that a reasoning block exists, so extraction must run.
|
||||
raw := "The user wants a greeting.\n</think>\n\nHello there!"
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: raw, ReasoningContent: ""},
|
||||
}
|
||||
|
||||
result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
|
||||
|
||||
Expect(result).To(HaveLen(1))
|
||||
content := *(result[0].Message.Content.(*string))
|
||||
Expect(content).To(ContainSubstring("Hello there!"))
|
||||
Expect(content).ToNot(ContainSubstring("</think>"))
|
||||
Expect(content).ToNot(ContainSubstring("The user wants a greeting"))
|
||||
Expect(result[0].Message.Reasoning).ToNot(BeNil())
|
||||
Expect(*result[0].Message.Reasoning).To(ContainSubstring("The user wants a greeting"))
|
||||
})
|
||||
|
||||
It("still splits a fully-tagged <think>…</think> block with a prefill token set", func() {
|
||||
raw := "<think>Reasoning here.</think>Final answer."
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: raw, ReasoningContent: ""},
|
||||
}
|
||||
|
||||
result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
|
||||
|
||||
Expect(result).To(HaveLen(1))
|
||||
Expect(*(result[0].Message.Content.(*string))).To(Equal("Final answer."))
|
||||
Expect(result[0].Message.Reasoning).ToNot(BeNil())
|
||||
Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here"))
|
||||
})
|
||||
|
||||
// End-to-end regression for the real production failure: a request with
|
||||
// enable_thinking=false against a <think>-capable model (qwen3 family).
|
||||
//
|
||||
// In non-thinking mode the model emits no reasoning block, so llama.cpp's
|
||||
// autoparser correctly returns ChatDeltas with Content set and
|
||||
// ReasoningContent EMPTY (verified against stock llama-server: the same
|
||||
// model with chat_template_kwargs.enable_thinking=false returns
|
||||
// reasoning_content=null and content="hello"). But thinkingStartToken is
|
||||
// detected per-model from the enable_thinking=TRUE render
|
||||
// (grpc-server renders with enable_thinking=true; DetectThinkingStartToken
|
||||
// does not evaluate the jinja {% if enable_thinking %} conditional), so it
|
||||
// is "<think>" even for this non-thinking request. The old code prepended
|
||||
// it and swallowed the answer. This is the case that broke session
|
||||
// summaries and auto-titles and was NOT covered before.
|
||||
It("preserves content for a non-thinking-mode request (enable_thinking=false, empty reasoning_content)", func() {
|
||||
// What llama.cpp's autoparser actually returns in non-thinking mode.
|
||||
chatDeltas := []*pb.ChatDelta{
|
||||
{Content: `{"short":"Go tests passed for internal/session"}`, ReasoningContent: ""},
|
||||
}
|
||||
|
||||
result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
|
||||
|
||||
Expect(result).To(HaveLen(1))
|
||||
Expect(*(result[0].Message.Content.(*string))).To(Equal(`{"short":"Go tests passed for internal/session"}`),
|
||||
"non-thinking-mode answers must reach the client intact, not be swallowed as reasoning")
|
||||
Expect(result[0].Message.Reasoning).To(BeNil())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("mergeToolCallDeltas", func() {
|
||||
|
||||
@@ -1579,7 +1579,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
||||
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
|
||||
// so it's safe to apply unconditionally in the no-reasoning branch.
|
||||
if deltaReasoning == "" && deltaContent != "" {
|
||||
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
|
||||
deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig)
|
||||
}
|
||||
reasoningText = deltaReasoning
|
||||
responseWithoutReasoning = deltaContent
|
||||
@@ -1587,7 +1587,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
||||
cleanedResponse = deltaContent
|
||||
toolCalls = deltaToolCalls
|
||||
} else {
|
||||
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
|
||||
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig)
|
||||
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
|
||||
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
|
||||
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
|
||||
|
||||
@@ -1356,7 +1356,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
|
||||
|
||||
// Extract reasoning from result before cleaning
|
||||
reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
reasoningContent, cleanedResult := reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
|
||||
// Parse tool calls if using functions
|
||||
var outputItems []schema.ORItemField
|
||||
@@ -1996,7 +1996,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
finalCleanedResult = extractor.CleanedContent()
|
||||
}
|
||||
if finalReasoning == "" && finalCleanedResult == "" {
|
||||
finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
}
|
||||
|
||||
// Close reasoning item if it exists and wasn't closed yet
|
||||
@@ -2493,7 +2493,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
|
||||
finalCleanedResult = extractor.CleanedContent()
|
||||
}
|
||||
if finalReasoning == "" && finalCleanedResult == "" {
|
||||
finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
|
||||
}
|
||||
|
||||
// Close reasoning item if it exists and wasn't closed yet
|
||||
|
||||
6
core/http/react-ui/src/hooks/useChat.js
vendored
6
core/http/react-ui/src/hooks/useChat.js
vendored
@@ -216,6 +216,12 @@ export function useChat(initialModel = '') {
|
||||
audio_url: { url: `data:${file.type};base64,${file.base64}` },
|
||||
})
|
||||
userFiles.push({ name: file.name, type: 'audio' })
|
||||
} else if (file.type?.startsWith('video/')) {
|
||||
messageContent.push({
|
||||
type: 'video_url',
|
||||
video_url: { url: `data:${file.type};base64,${file.base64}` },
|
||||
})
|
||||
userFiles.push({ name: file.name, type: 'video' })
|
||||
} else {
|
||||
// Text/PDF files - append to content
|
||||
if (file.textContent) {
|
||||
|
||||
@@ -265,7 +265,7 @@ function UserMessageContent({ content, files }) {
|
||||
<div className="chat-message-files">
|
||||
{files.map((f, i) => (
|
||||
<span key={i} className="chat-file-inline">
|
||||
<i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : 'fa-file'}`} />
|
||||
<i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : f.type === 'video' ? 'fa-film' : 'fa-file'}`} />
|
||||
{f.name}
|
||||
</span>
|
||||
))}
|
||||
@@ -274,6 +274,9 @@ function UserMessageContent({ content, files }) {
|
||||
{Array.isArray(content) && content.filter(c => c.type === 'image_url').map((img, i) => (
|
||||
<img key={i} src={img.image_url.url} alt="attached" className="chat-inline-image" />
|
||||
))}
|
||||
{Array.isArray(content) && content.filter(c => c.type === 'video_url').map((vid, i) => (
|
||||
<video key={i} src={vid.video_url.url} controls className="chat-inline-video" />
|
||||
))}
|
||||
</>
|
||||
)
|
||||
}
|
||||
@@ -711,7 +714,7 @@ export default function Chat() {
|
||||
for (const file of e.target.files) {
|
||||
const base64 = await fileToBase64(file)
|
||||
const entry = { name: file.name, type: file.type, base64 }
|
||||
if (!file.type.startsWith('image/') && !file.type.startsWith('audio/')) {
|
||||
if (!file.type.startsWith('image/') && !file.type.startsWith('audio/') && !file.type.startsWith('video/')) {
|
||||
entry.textContent = await file.text().catch(() => '')
|
||||
}
|
||||
newFiles.push(entry)
|
||||
@@ -1244,7 +1247,7 @@ export default function Chat() {
|
||||
<div className="chat-files">
|
||||
{files.map((f, i) => (
|
||||
<span key={i} className="chat-file-badge">
|
||||
<i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : 'fa-file'}`} />
|
||||
<i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : f.type?.startsWith('video/') ? 'fa-film' : 'fa-file'}`} />
|
||||
{f.name}
|
||||
<button onClick={() => setFiles(prev => prev.filter((_, idx) => idx !== i))}>
|
||||
<i className="fas fa-xmark" />
|
||||
@@ -1343,7 +1346,7 @@ export default function Chat() {
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
multiple
|
||||
accept="image/*,audio/*,application/pdf,.txt,.md,.csv,.json"
|
||||
accept="image/*,audio/*,video/*,application/pdf,.txt,.md,.csv,.json"
|
||||
style={{ display: 'none' }}
|
||||
onChange={handleFileChange}
|
||||
/>
|
||||
|
||||
@@ -89,6 +89,35 @@ func ExtractReasoningWithConfig(content, thinkingStartToken string, config Confi
|
||||
return reasoning, cleanedContent
|
||||
}
|
||||
|
||||
// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
|
||||
// model response. It behaves like ExtractReasoningWithConfig except that it only
|
||||
// honors a prefilled thinking start token when the response actually contains
|
||||
// the matching closing tag.
|
||||
//
|
||||
// Rationale: when a chat template injects the start token into the prompt (so
|
||||
// DetectThinkingStartToken returns e.g. "<think>"), the model's output begins
|
||||
// inside a reasoning block and carries only the closing tag. The defensive
|
||||
// fallback prepends the start token so the extractor can pair it with that
|
||||
// close tag. But on a COMPLETE response with no closing tag, the model answered
|
||||
// directly with no reasoning at all — prepending the start token would
|
||||
// manufacture an unclosed block that swallows the entire answer into reasoning,
|
||||
// leaving content empty (breaking short/direct answers such as session names or
|
||||
// JSON summaries). Genuine reasoning tags already present in the content still
|
||||
// extract, because dropping the synthetic prefill does not affect them.
|
||||
//
|
||||
// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an
|
||||
// as-yet-unclosed block is legitimate and its tokens should surface as
|
||||
// reasoning deltas as they arrive.
|
||||
func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
|
||||
startToken := thinkingStartToken
|
||||
if startToken != "" {
|
||||
if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) {
|
||||
startToken = ""
|
||||
}
|
||||
}
|
||||
return ExtractReasoningWithConfig(content, startToken, config)
|
||||
}
|
||||
|
||||
// PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
|
||||
// detected in the prompt. This allows the standard extraction logic to work correctly
|
||||
// for models where the thinking token is already in the prompt.
|
||||
@@ -131,6 +160,48 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
|
||||
return startToken + content
|
||||
}
|
||||
|
||||
// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs,
|
||||
// matching llama.cpp's chat-parser.cpp. Kept at package scope so that
|
||||
// ExtractReasoning and ClosingTokenForStart share a single source of truth.
|
||||
var defaultReasoningTagPairs = []TagPair{
|
||||
{Start: "<|START_THINKING|>", End: "<|END_THINKING|>"}, // Command-R models
|
||||
{Start: "<|inner_prefix|>", End: "<|inner_suffix|>"}, // Apertus models
|
||||
{Start: "<seed:think>", End: "</seed:think>"}, // Seed models
|
||||
{Start: "<think>", End: "</think>"}, // DeepSeek, Granite, ExaOne models
|
||||
{Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
|
||||
{Start: "<|channel>thought", End: "<channel|>"}, // Gemma 4 models
|
||||
{Start: "<thinking>", End: "</thinking>"}, // General thinking tag
|
||||
{Start: "[THINK]", End: "[/THINK]"}, // Magistral models
|
||||
}
|
||||
|
||||
// ClosingTokenForStart returns the closing reasoning tag that pairs with the
|
||||
// given start token, searching custom config TagPairs first then the built-in
|
||||
// defaults. Returns "" when startToken is empty or unrecognized.
|
||||
//
|
||||
// Used by the non-streaming autoparser fallback to decide whether a complete
|
||||
// response that began with a prefilled thinking token actually closed its
|
||||
// reasoning block: only then is synthesizing the start token (so the standard
|
||||
// extractor can pair it with the model's close tag) safe. A complete response
|
||||
// with no closing tag is a direct answer, not unclosed reasoning.
|
||||
func ClosingTokenForStart(startToken string, config *Config) string {
|
||||
if startToken == "" {
|
||||
return ""
|
||||
}
|
||||
if config != nil {
|
||||
for _, pair := range config.TagPairs {
|
||||
if pair.Start == startToken {
|
||||
return pair.End
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, pair := range defaultReasoningTagPairs {
|
||||
if pair.Start == startToken {
|
||||
return pair.End
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// ExtractReasoning extracts reasoning content from thinking tags and returns
|
||||
// both the extracted reasoning and the cleaned content (with tags removed).
|
||||
// It handles <thinking>...</thinking> and <think>...</think> tags.
|
||||
@@ -145,22 +216,7 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
|
||||
var cleanedParts []string
|
||||
remaining := content
|
||||
|
||||
// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
|
||||
defaultTagPairs := []struct {
|
||||
start string
|
||||
end string
|
||||
}{
|
||||
{"<|START_THINKING|>", "<|END_THINKING|>"}, // Command-R models
|
||||
{"<|inner_prefix|>", "<|inner_suffix|>"}, // Apertus models
|
||||
{"<seed:think>", "</seed:think>"}, // Seed models
|
||||
{"<think>", "</think>"}, // DeepSeek, Granite, ExaOne models
|
||||
{"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
|
||||
{"<|channel>thought", "<channel|>"}, // Gemma 4 models
|
||||
{"<thinking>", "</thinking>"}, // General thinking tag
|
||||
{"[THINK]", "[/THINK]"}, // Magistral models
|
||||
}
|
||||
|
||||
// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
|
||||
// Merge custom tag pairs (highest priority) with the built-in defaults.
|
||||
var tagPairs []struct {
|
||||
start string
|
||||
end string
|
||||
@@ -175,9 +231,11 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
|
||||
}
|
||||
}
|
||||
}
|
||||
// Add default tag pairs
|
||||
for _, pair := range defaultTagPairs {
|
||||
tagPairs = append(tagPairs, pair)
|
||||
for _, pair := range defaultReasoningTagPairs {
|
||||
tagPairs = append(tagPairs, struct {
|
||||
start string
|
||||
end string
|
||||
}{pair.Start, pair.End})
|
||||
}
|
||||
|
||||
// Track the last position we've processed
|
||||
|
||||
@@ -1175,6 +1175,55 @@ var _ = Describe("Custom Tokens and Tag Pairs Integration", func() {
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("ClosingTokenForStart", func() {
|
||||
It("returns the default closing tag for a known start token", func() {
|
||||
Expect(ClosingTokenForStart("<think>", nil)).To(Equal("</think>"))
|
||||
Expect(ClosingTokenForStart("<thinking>", nil)).To(Equal("</thinking>"))
|
||||
Expect(ClosingTokenForStart("[THINK]", nil)).To(Equal("[/THINK]"))
|
||||
})
|
||||
|
||||
It("returns empty for an empty or unknown start token", func() {
|
||||
Expect(ClosingTokenForStart("", nil)).To(BeEmpty())
|
||||
Expect(ClosingTokenForStart("<nope>", nil)).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("prefers custom config tag pairs over the defaults", func() {
|
||||
cfg := &Config{TagPairs: []TagPair{{Start: "<think>", End: "<<END>>"}}}
|
||||
Expect(ClosingTokenForStart("<think>", cfg)).To(Equal("<<END>>"))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("ExtractReasoningComplete", func() {
|
||||
const startToken = "<think>"
|
||||
|
||||
It("keeps a tag-less answer as content when a start token is prefilled but no close tag is present", func() {
|
||||
// The bug guard: prompt-prefilled <think>, model answered directly with
|
||||
// no reasoning. The synthetic prefill must not swallow it as reasoning.
|
||||
reasoning, content := ExtractReasoningComplete("hello", startToken, Config{})
|
||||
Expect(reasoning).To(BeEmpty())
|
||||
Expect(content).To(Equal("hello"))
|
||||
})
|
||||
|
||||
It("extracts reasoning when the model emits only the closing tag (legitimate prefill)", func() {
|
||||
reasoning, content := ExtractReasoningComplete("the rationale\n</think>\n\nthe answer", startToken, Config{})
|
||||
Expect(reasoning).To(ContainSubstring("the rationale"))
|
||||
Expect(content).To(ContainSubstring("the answer"))
|
||||
Expect(content).ToNot(ContainSubstring("</think>"))
|
||||
})
|
||||
|
||||
It("extracts a fully-tagged block regardless of the prefill token", func() {
|
||||
reasoning, content := ExtractReasoningComplete("<think>r</think>answer", startToken, Config{})
|
||||
Expect(reasoning).To(Equal("r"))
|
||||
Expect(content).To(Equal("answer"))
|
||||
})
|
||||
|
||||
It("behaves like ExtractReasoningWithConfig when no start token is prefilled", func() {
|
||||
reasoning, content := ExtractReasoningComplete("<think>r</think>answer", "", Config{})
|
||||
Expect(reasoning).To(Equal("r"))
|
||||
Expect(content).To(Equal("answer"))
|
||||
})
|
||||
})
|
||||
|
||||
// Helper function to create bool pointers for test configs
|
||||
func boolPtr(b bool) *bool {
|
||||
return &b
|
||||
|
||||
Reference in New Issue
Block a user